Home | History | Annotate | Line # | Download | only in services
      1 /*
      2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
      3  *
      4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
      5  *
      6  * This software is open source.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  *
     12  * Redistributions of source code must retain the above copyright notice,
     13  * this list of conditions and the following disclaimer.
     14  *
     15  * Redistributions in binary form must reproduce the above copyright notice,
     16  * this list of conditions and the following disclaimer in the documentation
     17  * and/or other materials provided with the distribution.
     18  *
     19  * Neither the name of the NLNET LABS nor the names of its contributors may
     20  * be used to endorse or promote products derived from this software without
     21  * specific prior written permission.
     22  *
     23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     34  */
     35 
     36 /**
     37  * \file
     38  *
     39  * This file has functions to get queries from clients.
     40  */
     41 #include "config.h"
     42 #ifdef HAVE_SYS_TYPES_H
     43 #  include <sys/types.h>
     44 #endif
     45 #include <sys/time.h>
     46 #include <limits.h>
     47 #ifdef USE_TCP_FASTOPEN
     48 #include <netinet/tcp.h>
     49 #endif
     50 #include <ctype.h>
     51 #include "services/listen_dnsport.h"
     52 #include "services/outside_network.h"
     53 #include "util/netevent.h"
     54 #include "util/log.h"
     55 #include "util/config_file.h"
     56 #include "util/net_help.h"
     57 #include "sldns/sbuffer.h"
     58 #include "sldns/parseutil.h"
     59 #include "sldns/wire2str.h"
     60 #include "services/mesh.h"
     61 #include "util/fptr_wlist.h"
     62 #include "util/locks.h"
     63 #include "util/timeval_func.h"
     64 
     65 #ifdef HAVE_NETDB_H
     66 #include <netdb.h>
     67 #endif
     68 #include <fcntl.h>
     69 
     70 #ifdef HAVE_SYS_UN_H
     71 #include <sys/un.h>
     72 #endif
     73 
     74 #ifdef HAVE_SYSTEMD
     75 #include <systemd/sd-daemon.h>
     76 #endif
     77 
     78 #ifdef HAVE_IFADDRS_H
     79 #include <ifaddrs.h>
     80 #endif
     81 #ifdef HAVE_NET_IF_H
     82 #include <net/if.h>
     83 #endif
     84 
     85 #ifdef HAVE_TIME_H
     86 #include <time.h>
     87 #endif
     88 #include <sys/time.h>
     89 
     90 #ifdef HAVE_NGTCP2
     91 #include <ngtcp2/ngtcp2.h>
     92 #include <ngtcp2/ngtcp2_crypto.h>
     93 #ifdef HAVE_NGTCP2_NGTCP2_CRYPTO_OSSL_H
     94 #include <ngtcp2/ngtcp2_crypto_ossl.h>
     95 #elif defined(HAVE_NGTCP2_NGTCP2_CRYPTO_QUICTLS_H)
     96 #include <ngtcp2/ngtcp2_crypto_quictls.h>
     97 #elif defined(HAVE_NGTCP2_NGTCP2_CRYPTO_OPENSSL_H)
     98 #include <ngtcp2/ngtcp2_crypto_openssl.h>
     99 #define MAKE_QUIC_METHOD 1
    100 #endif
    101 #endif
    102 
    103 #ifdef HAVE_OPENSSL_SSL_H
    104 #include <openssl/ssl.h>
    105 #endif
    106 
    107 #ifdef HAVE_LINUX_NET_TSTAMP_H
    108 #include <linux/net_tstamp.h>
    109 #endif
    110 
    111 /** number of queued TCP connections for listen() */
    112 #define TCP_BACKLOG 256
    113 
    114 #ifndef THREADS_DISABLED
    115 /** lock on the counter of stream buffer memory */
    116 static lock_basic_type stream_wait_count_lock;
    117 /** lock on the counter of HTTP2 query buffer memory */
    118 static lock_basic_type http2_query_buffer_count_lock;
    119 /** lock on the counter of HTTP2 response buffer memory */
    120 static lock_basic_type http2_response_buffer_count_lock;
    121 #endif
    122 /** size (in bytes) of stream wait buffers */
    123 static size_t stream_wait_count = 0;
    124 /** is the lock initialised for stream wait buffers */
    125 static int stream_wait_lock_inited = 0;
    126 /** size (in bytes) of HTTP2 query buffers */
    127 static size_t http2_query_buffer_count = 0;
    128 /** is the lock initialised for HTTP2 query buffers */
    129 static int http2_query_buffer_lock_inited = 0;
    130 /** size (in bytes) of HTTP2 response buffers */
    131 static size_t http2_response_buffer_count = 0;
    132 /** is the lock initialised for HTTP2 response buffers */
    133 static int http2_response_buffer_lock_inited = 0;
    134 
    135 /**
    136  * Debug print of the getaddrinfo returned address.
    137  * @param addr: the address returned.
    138  * @param additional: additional text that describes the type of socket,
    139  * 	or NULL for no text.
    140  */
    141 static void
    142 verbose_print_addr(struct addrinfo *addr, const char* additional)
    143 {
    144 	if(verbosity >= VERB_ALGO) {
    145 		char buf[100];
    146 		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
    147 #ifdef INET6
    148 		if(addr->ai_family == AF_INET6)
    149 			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
    150 				sin6_addr;
    151 #endif /* INET6 */
    152 		if(inet_ntop(addr->ai_family, sinaddr, buf,
    153 			(socklen_t)sizeof(buf)) == 0) {
    154 			(void)strlcpy(buf, "(null)", sizeof(buf));
    155 		}
    156 		buf[sizeof(buf)-1] = 0;
    157 		verbose(VERB_ALGO, "creating %s%s socket %s %d%s%s",
    158 			addr->ai_socktype==SOCK_DGRAM?"udp":
    159 			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
    160 			addr->ai_family==AF_INET?"4":
    161 			addr->ai_family==AF_INET6?"6":
    162 			"_otherfam", buf,
    163 			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port),
    164 			(additional?" ":""), (additional?additional:""));
    165 	}
    166 }
    167 
    168 void
    169 verbose_print_unbound_socket(struct unbound_socket* ub_sock)
    170 {
    171 	if(verbosity >= VERB_ALGO) {
    172 		char buf[256];
    173 		log_info("listing of unbound_socket structure:");
    174 		addr_to_str((void*)ub_sock->addr, ub_sock->addrlen, buf,
    175 			sizeof(buf));
    176 		log_info("%s s is: %d, fam is: %s, acl: %s", buf, ub_sock->s,
    177 			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
    178 			ub_sock->acl?"yes":"no");
    179 	}
    180 }
    181 
#ifdef HAVE_SYSTEMD
/**
 * Look for a socket handed over by systemd socket activation that matches
 * the requested properties.
 * @param family: address family that the passed socket must have.
 * @param socktype: socket type that the passed socket must have.
 * @param listen: listening state to match. Use it only for stream
 *	protocols; for UDP pass -1.
 * @param addr: address used only in the error message when no socket is
 *	found, or NULL.
 * @param addrlen: length of addr.
 * @param path: text used in the error message when addr is NULL; may be
 *	NULL.
 * @return the first matching file descriptor passed by systemd, or -1 when
 *	systemd is not running, the LISTEN_PID or LISTEN_FDS environment
 *	variables are missing, no descriptors were passed, or none matches.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		/* sd_is_socket() returns a positive value on a match, zero
		 * on a mismatch and a negative errno-style code on failure.
		 * Only a positive value is a match; a failure must not
		 * select the descriptor. */
		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype,
			listen) > 0) {
			s = SD_LISTEN_FDS_START + i;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			/* path may be NULL; do not pass NULL to %s */
			log_err("systemd sd_listen_fds(): %s",
				path?path:"(null)");
	}
	return s;
}
#endif
    241 
    242 int
    243 create_udp_sock(int family, int socktype, struct sockaddr* addr,
    244         socklen_t addrlen, int v6only, int* inuse, int* noproto,
    245 	int rcv, int snd, int listen, int* reuseport, int transparent,
    246 	int freebind, int use_systemd, int dscp)
    247 {
    248 	int s;
    249 	char* err;
    250 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
    251 	int on=1;
    252 #endif
    253 #ifdef IPV6_MTU
    254 	int mtu = IPV6_MIN_MTU;
    255 #endif
    256 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
    257 	(void)rcv;
    258 #endif
    259 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
    260 	(void)snd;
    261 #endif
    262 #ifndef IPV6_V6ONLY
    263 	(void)v6only;
    264 #endif
    265 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
    266 	(void)transparent;
    267 #endif
    268 #if !defined(IP_FREEBIND)
    269 	(void)freebind;
    270 #endif
    271 #ifdef HAVE_SYSTEMD
    272 	int got_fd_from_systemd = 0;
    273 
    274 	if (!use_systemd
    275 	    || (use_systemd
    276 		&& (s = systemd_get_activated(family, socktype, -1, addr,
    277 					      addrlen, NULL)) == -1)) {
    278 #else
    279 	(void)use_systemd;
    280 #endif
    281 	if((s = socket(family, socktype, 0)) == -1) {
    282 		*inuse = 0;
    283 #ifndef USE_WINSOCK
    284 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
    285 			*noproto = 1;
    286 			return -1;
    287 		}
    288 #else
    289 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
    290 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
    291 			*noproto = 1;
    292 			return -1;
    293 		}
    294 #endif
    295 		log_err("can't create socket: %s", sock_strerror(errno));
    296 		*noproto = 0;
    297 		return -1;
    298 	}
    299 #ifdef HAVE_SYSTEMD
    300 	} else {
    301 		got_fd_from_systemd = 1;
    302 	}
    303 #endif
    304 	if(listen) {
    305 #ifdef SO_REUSEADDR
    306 		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
    307 			(socklen_t)sizeof(on)) < 0) {
    308 			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
    309 				sock_strerror(errno));
    310 #ifndef USE_WINSOCK
    311 			if(errno != ENOSYS) {
    312 				close(s);
    313 				*noproto = 0;
    314 				*inuse = 0;
    315 				return -1;
    316 			}
    317 #else
    318 			closesocket(s);
    319 			*noproto = 0;
    320 			*inuse = 0;
    321 			return -1;
    322 #endif
    323 		}
    324 #endif /* SO_REUSEADDR */
    325 #ifdef SO_REUSEPORT
    326 #  ifdef SO_REUSEPORT_LB
    327 		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
    328 		 * like SO_REUSEPORT on Linux.  This is what the users want
    329 		 * with the config option in unbound.conf; if we actually
    330 		 * need local address and port reuse they'll also need to
    331 		 * have SO_REUSEPORT set for them, assume it was _LB they want.
    332 		 */
    333 		if (reuseport && *reuseport &&
    334 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
    335 			(socklen_t)sizeof(on)) < 0) {
    336 #ifdef ENOPROTOOPT
    337 			if(errno != ENOPROTOOPT || verbosity >= 3)
    338 				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
    339 					strerror(errno));
    340 #endif
    341 			/* this option is not essential, we can continue */
    342 			*reuseport = 0;
    343 		}
    344 #  else /* no SO_REUSEPORT_LB */
    345 
    346 		/* try to set SO_REUSEPORT so that incoming
    347 		 * queries are distributed evenly among the receiving threads.
    348 		 * Each thread must have its own socket bound to the same port,
    349 		 * with SO_REUSEPORT set on each socket.
    350 		 */
    351 		if (reuseport && *reuseport &&
    352 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
    353 			(socklen_t)sizeof(on)) < 0) {
    354 #ifdef ENOPROTOOPT
    355 			if(errno != ENOPROTOOPT || verbosity >= 3)
    356 				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
    357 					strerror(errno));
    358 #endif
    359 			/* this option is not essential, we can continue */
    360 			*reuseport = 0;
    361 		}
    362 #  endif /* SO_REUSEPORT_LB */
    363 #else
    364 		(void)reuseport;
    365 #endif /* defined(SO_REUSEPORT) */
    366 #ifdef IP_TRANSPARENT
    367 		if (transparent &&
    368 		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
    369 		    (socklen_t)sizeof(on)) < 0) {
    370 			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
    371 			strerror(errno));
    372 		}
    373 #elif defined(IP_BINDANY)
    374 		if (transparent &&
    375 		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
    376 		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
    377 		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
    378 			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
    379 			(family==AF_INET6?"V6":""), strerror(errno));
    380 		}
    381 #elif defined(SO_BINDANY)
    382 		if (transparent &&
    383 		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
    384 		    (socklen_t)sizeof(on)) < 0) {
    385 			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
    386 			strerror(errno));
    387 		}
    388 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
    389 	}
    390 #ifdef IP_FREEBIND
    391 	if(freebind &&
    392 	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
    393 	    (socklen_t)sizeof(on)) < 0) {
    394 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
    395 		strerror(errno));
    396 	}
    397 #endif /* IP_FREEBIND */
    398 	if(rcv) {
    399 #ifdef SO_RCVBUF
    400 		int got;
    401 		socklen_t slen = (socklen_t)sizeof(got);
    402 #  ifdef SO_RCVBUFFORCE
    403 		/* Linux specific: try to use root permission to override
    404 		 * system limits on rcvbuf. The limit is stored in
    405 		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
    406 		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
    407 			(socklen_t)sizeof(rcv)) < 0) {
    408 			if(errno != EPERM) {
    409 				log_err("setsockopt(..., SO_RCVBUFFORCE, "
    410 					"...) failed: %s", sock_strerror(errno));
    411 				sock_close(s);
    412 				*noproto = 0;
    413 				*inuse = 0;
    414 				return -1;
    415 			}
    416 #  endif /* SO_RCVBUFFORCE */
    417 			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
    418 				(socklen_t)sizeof(rcv)) < 0) {
    419 				log_err("setsockopt(..., SO_RCVBUF, "
    420 					"...) failed: %s", sock_strerror(errno));
    421 				sock_close(s);
    422 				*noproto = 0;
    423 				*inuse = 0;
    424 				return -1;
    425 			}
    426 			/* check if we got the right thing or if system
    427 			 * reduced to some system max.  Warn if so */
    428 			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
    429 				&slen) >= 0 && got < rcv/2) {
    430 				log_warn("so-rcvbuf %u was not granted. "
    431 					"Got %u. To fix: start with "
    432 					"root permissions(linux) or sysctl "
    433 					"bigger net.core.rmem_max(linux) or "
    434 					"kern.ipc.maxsockbuf(bsd) values.",
    435 					(unsigned)rcv, (unsigned)got);
    436 			}
    437 #  ifdef SO_RCVBUFFORCE
    438 		}
    439 #  endif
    440 #endif /* SO_RCVBUF */
    441 	}
    442 	/* first do RCVBUF as the receive buffer is more important */
    443 	if(snd) {
    444 #ifdef SO_SNDBUF
    445 		int got;
    446 		socklen_t slen = (socklen_t)sizeof(got);
    447 #  ifdef SO_SNDBUFFORCE
    448 		/* Linux specific: try to use root permission to override
    449 		 * system limits on sndbuf. The limit is stored in
    450 		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
    451 		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
    452 			(socklen_t)sizeof(snd)) < 0) {
    453 			if(errno != EPERM && errno != ENOBUFS) {
    454 				log_err("setsockopt(..., SO_SNDBUFFORCE, "
    455 					"...) failed: %s", sock_strerror(errno));
    456 				sock_close(s);
    457 				*noproto = 0;
    458 				*inuse = 0;
    459 				return -1;
    460 			}
    461 			if(errno != EPERM) {
    462 				verbose(VERB_ALGO, "setsockopt(..., SO_SNDBUFFORCE, "
    463 					"...) was not granted: %s", sock_strerror(errno));
    464 			}
    465 #  endif /* SO_SNDBUFFORCE */
    466 			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
    467 				(socklen_t)sizeof(snd)) < 0) {
    468 				if(errno != ENOSYS && errno != ENOBUFS) {
    469 					log_err("setsockopt(..., SO_SNDBUF, "
    470 						"...) failed: %s", sock_strerror(errno));
    471 					sock_close(s);
    472 					*noproto = 0;
    473 					*inuse = 0;
    474 					return -1;
    475 				}
    476 				log_warn("setsockopt(..., SO_SNDBUF, "
    477 					"...) was not granted: %s", sock_strerror(errno));
    478 			}
    479 			/* check if we got the right thing or if system
    480 			 * reduced to some system max.  Warn if so */
    481 			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
    482 				&slen) >= 0 && got < snd/2) {
    483 				log_warn("so-sndbuf %u was not granted. "
    484 					"Got %u. To fix: start with "
    485 					"root permissions(linux) or sysctl "
    486 					"bigger net.core.wmem_max(linux) or "
    487 					"kern.ipc.maxsockbuf(bsd) values. or "
    488 					"set so-sndbuf: 0 (use system value).",
    489 					(unsigned)snd, (unsigned)got);
    490 			}
    491 #  ifdef SO_SNDBUFFORCE
    492 		}
    493 #  endif
    494 #endif /* SO_SNDBUF */
    495 	}
    496 	err = set_ip_dscp(s, family, dscp);
    497 	if(err != NULL)
    498 		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
    499 	if(family == AF_INET6) {
    500 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
    501 		int omit6_set = 0;
    502 		int action;
    503 # endif
    504 # if defined(IPV6_V6ONLY)
    505 		if(v6only
    506 #   ifdef HAVE_SYSTEMD
    507 			/* Systemd wants to control if the socket is v6 only
    508 			 * or both, with BindIPv6Only=default, ipv6-only or
    509 			 * both in systemd.socket, so it is not set here. */
    510 			&& !got_fd_from_systemd
    511 #   endif
    512 			) {
    513 			int val=(v6only==2)?0:1;
    514 			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
    515 				(void*)&val, (socklen_t)sizeof(val)) < 0) {
    516 				log_err("setsockopt(..., IPV6_V6ONLY"
    517 					", ...) failed: %s", sock_strerror(errno));
    518 				sock_close(s);
    519 				*noproto = 0;
    520 				*inuse = 0;
    521 				return -1;
    522 			}
    523 		}
    524 # endif
    525 # if defined(IPV6_USE_MIN_MTU)
    526 		/*
    527 		 * There is no fragmentation of IPv6 datagrams
    528 		 * during forwarding in the network. Therefore
    529 		 * we do not send UDP datagrams larger than
    530 		 * the minimum IPv6 MTU of 1280 octets. The
    531 		 * EDNS0 message length can be larger if the
    532 		 * network stack supports IPV6_USE_MIN_MTU.
    533 		 */
    534 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
    535 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
    536 			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
    537 				"...) failed: %s", sock_strerror(errno));
    538 			sock_close(s);
    539 			*noproto = 0;
    540 			*inuse = 0;
    541 			return -1;
    542 		}
    543 # elif defined(IPV6_MTU)
    544 #   ifndef USE_WINSOCK
    545 		/*
    546 		 * On Linux, to send no larger than 1280, the PMTUD is
    547 		 * disabled by default for datagrams anyway, so we set
    548 		 * the MTU to use.
    549 		 */
    550 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
    551 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
    552 			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
    553 				sock_strerror(errno));
    554 			sock_close(s);
    555 			*noproto = 0;
    556 			*inuse = 0;
    557 			return -1;
    558 		}
    559 #   elif defined(IPV6_USER_MTU)
    560 		/* As later versions of the mingw crosscompiler define
    561 		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
    562 		 * instead which is writable; IPV6_MTU is readonly there. */
    563 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
    564 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
    565 			if (WSAGetLastError() != WSAENOPROTOOPT) {
    566 				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
    567 					wsa_strerror(WSAGetLastError()));
    568 				sock_close(s);
    569 				*noproto = 0;
    570 				*inuse = 0;
    571 				return -1;
    572 			}
    573 		}
    574 #   endif /* USE_WINSOCK */
    575 # endif /* IPv6 MTU */
    576 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
    577 #  if defined(IP_PMTUDISC_OMIT)
    578 		action = IP_PMTUDISC_OMIT;
    579 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
    580 			&action, (socklen_t)sizeof(action)) < 0) {
    581 
    582 			if (errno != EINVAL) {
    583 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
    584 					strerror(errno));
    585 				sock_close(s);
    586 				*noproto = 0;
    587 				*inuse = 0;
    588 				return -1;
    589 			}
    590 		}
    591 		else
    592 		{
    593 		    omit6_set = 1;
    594 		}
    595 #  endif
    596 		if (omit6_set == 0) {
    597 			action = IP_PMTUDISC_DONT;
    598 			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
    599 				&action, (socklen_t)sizeof(action)) < 0) {
    600 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
    601 					strerror(errno));
    602 				sock_close(s);
    603 				*noproto = 0;
    604 				*inuse = 0;
    605 				return -1;
    606 			}
    607 		}
    608 # endif /* IPV6_MTU_DISCOVER */
    609 	} else if(family == AF_INET) {
    610 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
    611 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
    612  * PMTU information is not accepted, but fragmentation is allowed
    613  * if and only if the packet size exceeds the outgoing interface MTU
    614  * (and also uses the interface mtu to determine the size of the packets).
    615  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
    616  * FreeBSD already has same semantics without setting the option. */
    617 		int omit_set = 0;
    618 		int action;
    619 #   if defined(IP_PMTUDISC_OMIT)
    620 		action = IP_PMTUDISC_OMIT;
    621 		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
    622 			&action, (socklen_t)sizeof(action)) < 0) {
    623 
    624 			if (errno != EINVAL) {
    625 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
    626 					strerror(errno));
    627 				sock_close(s);
    628 				*noproto = 0;
    629 				*inuse = 0;
    630 				return -1;
    631 			}
    632 		}
    633 		else
    634 		{
    635 		    omit_set = 1;
    636 		}
    637 #   endif
    638 		if (omit_set == 0) {
    639    			action = IP_PMTUDISC_DONT;
    640 			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
    641 				&action, (socklen_t)sizeof(action)) < 0) {
    642 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
    643 					strerror(errno));
    644 				sock_close(s);
    645 				*noproto = 0;
    646 				*inuse = 0;
    647 				return -1;
    648 			}
    649 		}
    650 #  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
    651 		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
    652 		 * but does not work on that version, so we exclude it */
    653 		/* a nonzero value disables fragmentation, according to
    654 		 * docs.oracle.com for ip(4). */
    655 		int off = 1;
    656 		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
    657 			&off, (socklen_t)sizeof(off)) < 0) {
    658 			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
    659 				strerror(errno));
    660 			sock_close(s);
    661 			*noproto = 0;
    662 			*inuse = 0;
    663 			return -1;
    664 		}
    665 #  endif /* IPv4 MTU */
    666 	}
    667 	if(
    668 #ifdef HAVE_SYSTEMD
    669 		!got_fd_from_systemd &&
    670 #endif
    671 		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
    672 		*noproto = 0;
    673 		*inuse = 0;
    674 #ifndef USE_WINSOCK
    675 #ifdef EADDRINUSE
    676 		*inuse = (errno == EADDRINUSE);
    677 		/* detect freebsd jail with no ipv6 permission */
    678 		if(family==AF_INET6 && errno==EINVAL)
    679 			*noproto = 1;
    680 		else if(errno != EADDRINUSE &&
    681 			!(errno == EACCES && verbosity < 4 && !listen)
    682 #ifdef EADDRNOTAVAIL
    683 			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
    684 #endif
    685 			) {
    686 			log_err_addr("can't bind socket", strerror(errno),
    687 				(struct sockaddr_storage*)addr, addrlen);
    688 		}
    689 #endif /* EADDRINUSE */
    690 #else /* USE_WINSOCK */
    691 		if(WSAGetLastError() != WSAEADDRINUSE &&
    692 			WSAGetLastError() != WSAEADDRNOTAVAIL &&
    693 			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
    694 			log_err_addr("can't bind socket",
    695 				wsa_strerror(WSAGetLastError()),
    696 				(struct sockaddr_storage*)addr, addrlen);
    697 		}
    698 #endif /* USE_WINSOCK */
    699 		sock_close(s);
    700 		return -1;
    701 	}
    702 	if(!fd_set_nonblock(s)) {
    703 		*noproto = 0;
    704 		*inuse = 0;
    705 		sock_close(s);
    706 		return -1;
    707 	}
    708 	return s;
    709 }
    710 
    711 int
    712 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
    713 	int* reuseport, int transparent, int mss, int nodelay, int freebind,
    714 	int use_systemd, int dscp, const char* additional)
    715 {
    716 	int s = -1;
    717 	char* err;
    718 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT)		\
    719 	|| defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT)	\
    720 	|| defined(IP_BINDANY) || defined(IP_FREEBIND)		\
    721 	|| defined(SO_BINDANY) || defined(TCP_NODELAY)
    722 	int on = 1;
    723 #endif
    724 #ifdef HAVE_SYSTEMD
    725 	int got_fd_from_systemd = 0;
    726 #endif
    727 #ifdef USE_TCP_FASTOPEN
    728 	int qlen;
    729 #endif
    730 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
    731 	(void)transparent;
    732 #endif
    733 #if !defined(IP_FREEBIND)
    734 	(void)freebind;
    735 #endif
    736 	verbose_print_addr(addr, additional);
    737 	*noproto = 0;
    738 #ifdef HAVE_SYSTEMD
    739 	if (!use_systemd ||
    740 	    (use_systemd
    741 	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
    742 					   addr->ai_addr, addr->ai_addrlen,
    743 					   NULL)) == -1)) {
    744 #else
    745 	(void)use_systemd;
    746 #endif
    747 	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
    748 #ifndef USE_WINSOCK
    749 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
    750 			*noproto = 1;
    751 			return -1;
    752 		}
    753 #else
    754 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
    755 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
    756 			*noproto = 1;
    757 			return -1;
    758 		}
    759 #endif
    760 		log_err("can't create socket: %s", sock_strerror(errno));
    761 		return -1;
    762 	}
    763 	if(nodelay) {
    764 #if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
    765 		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
    766 			(socklen_t)sizeof(on)) < 0) {
    767 			#ifndef USE_WINSOCK
    768 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
    769 				strerror(errno));
    770 			#else
    771 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
    772 				wsa_strerror(WSAGetLastError()));
    773 			#endif
    774 		}
    775 #else
    776 		log_warn(" setsockopt(TCP_NODELAY) unsupported");
    777 #endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
    778 	}
    779 	if (mss > 0) {
    780 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
    781 		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
    782 			(socklen_t)sizeof(mss)) < 0) {
    783 			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
    784 				sock_strerror(errno));
    785 		} else {
    786 			verbose(VERB_ALGO,
    787 				" tcp socket mss set to %d", mss);
    788 		}
    789 #else
    790 		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
    791 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
    792 	}
    793 #ifdef HAVE_SYSTEMD
    794 	} else {
    795 		got_fd_from_systemd = 1;
    796     }
    797 #endif
    798 #ifdef SO_REUSEADDR
    799 	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
    800 		(socklen_t)sizeof(on)) < 0) {
    801 		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
    802 			sock_strerror(errno));
    803 		sock_close(s);
    804 		return -1;
    805 	}
    806 #endif /* SO_REUSEADDR */
    807 #ifdef IP_FREEBIND
    808 	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
    809 	    (socklen_t)sizeof(on)) < 0) {
    810 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
    811 		strerror(errno));
    812 	}
    813 #endif /* IP_FREEBIND */
    814 #ifdef SO_REUSEPORT
    815 	/* try to set SO_REUSEPORT so that incoming
    816 	 * connections are distributed evenly among the receiving threads.
    817 	 * Each thread must have its own socket bound to the same port,
    818 	 * with SO_REUSEPORT set on each socket.
    819 	 */
    820 	if (reuseport && *reuseport &&
    821 		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
    822 		(socklen_t)sizeof(on)) < 0) {
    823 #ifdef ENOPROTOOPT
    824 		if(errno != ENOPROTOOPT || verbosity >= 3)
    825 			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
    826 				strerror(errno));
    827 #endif
    828 		/* this option is not essential, we can continue */
    829 		*reuseport = 0;
    830 	}
    831 #else
    832 	(void)reuseport;
    833 #endif /* defined(SO_REUSEPORT) */
    834 #if defined(IPV6_V6ONLY)
    835 	if(addr->ai_family == AF_INET6 && v6only
    836 #  ifdef HAVE_SYSTEMD
    837 		/* Systemd wants to control if the socket is v6 only
    838 		 * or both, with BindIPv6Only=default, ipv6-only or
    839 		 * both in systemd.socket, so it is not set here. */
    840 		&& !got_fd_from_systemd
    841 #  endif
    842 		) {
    843 		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
    844 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
    845 			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
    846 				sock_strerror(errno));
    847 			sock_close(s);
    848 			return -1;
    849 		}
    850 	}
    851 #else
    852 	(void)v6only;
    853 #endif /* IPV6_V6ONLY */
    854 #ifdef IP_TRANSPARENT
    855 	if (transparent &&
    856 	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
    857 	    (socklen_t)sizeof(on)) < 0) {
    858 		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
    859 			strerror(errno));
    860 	}
    861 #elif defined(IP_BINDANY)
    862 	if (transparent &&
    863 	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
    864 	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
    865 	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
    866 		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
    867 		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
    868 	}
    869 #elif defined(SO_BINDANY)
    870 	if (transparent &&
    871 	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
    872 	    sizeof(on)) < 0) {
    873 		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
    874 		strerror(errno));
    875 	}
    876 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
    877 	err = set_ip_dscp(s, addr->ai_family, dscp);
    878 	if(err != NULL)
    879 		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
    880 	if(
    881 #ifdef HAVE_SYSTEMD
    882 		!got_fd_from_systemd &&
    883 #endif
    884         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
    885 #ifndef USE_WINSOCK
    886 		/* detect freebsd jail with no ipv6 permission */
    887 		if(addr->ai_family==AF_INET6 && errno==EINVAL)
    888 			*noproto = 1;
    889 		else {
    890 			log_err_addr("can't bind socket", strerror(errno),
    891 				(struct sockaddr_storage*)addr->ai_addr,
    892 				addr->ai_addrlen);
    893 		}
    894 #else
    895 		log_err_addr("can't bind socket",
    896 			wsa_strerror(WSAGetLastError()),
    897 			(struct sockaddr_storage*)addr->ai_addr,
    898 			addr->ai_addrlen);
    899 #endif
    900 		sock_close(s);
    901 		return -1;
    902 	}
    903 	if(!fd_set_nonblock(s)) {
    904 		sock_close(s);
    905 		return -1;
    906 	}
    907 	if(listen(s, TCP_BACKLOG) == -1) {
    908 		log_err("can't listen: %s", sock_strerror(errno));
    909 		sock_close(s);
    910 		return -1;
    911 	}
    912 #ifdef USE_TCP_FASTOPEN
    913 	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
    914 	   against IP spoofing attacks as suggested in RFC7413 */
    915 #ifdef __APPLE__
    916 	/* OS X implementation only supports qlen of 1 via this call. Actual
    917 	   value is configured by the net.inet.tcp.fastopen_backlog kernel param. */
    918 	qlen = 1;
    919 #else
    920 	/* 5 is recommended on linux */
    921 	qlen = 5;
    922 #endif
    923 	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
    924 		  sizeof(qlen))) == -1 ) {
    925 #ifdef ENOPROTOOPT
    926 		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
    927 		   disabled, except when verbosity enabled for debugging */
    928 		if(errno != ENOPROTOOPT || verbosity >= 3) {
    929 #endif
    930 		  if(errno == EPERM) {
    931 		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
    932 		  } else {
    933 		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
    934 		  }
    935 #ifdef ENOPROTOOPT
    936 		}
    937 #endif
    938 	}
    939 #endif
    940 	return s;
    941 }
    942 
/**
 * Apply a DiffServ codepoint to a socket.
 * The codepoint is shifted left by two bits and written as IPV6_TCLASS for
 * AF_INET6 sockets and as IP_TOS for every other address family.
 * @param socket: fd to set the option on.
 * @param addrfamily: address family of the socket.
 * @param dscp: codepoint; 0 means the socket is left untouched.
 * @return NULL on success (also when dscp is 0), otherwise an error string.
 */
char*
set_ip_dscp(int socket, int addrfamily, int dscp)
{
	int tos;

	/* nothing requested, nothing to do */
	if(dscp == 0)
		return NULL;
	tos = dscp << 2;

	if(addrfamily == AF_INET6) {
#ifdef IPV6_TCLASS
		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&tos,
			sizeof(tos)) < 0)
			return sock_strerror(errno);
		return NULL;
#else
		return "IPV6_TCLASS not defined on this system";
#endif
	}

	/* every other family is handled through the IPv4 TOS option */
	if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&tos, sizeof(tos)) < 0)
		return sock_strerror(errno);
	return NULL;
}
    968 
/**
 * Create a listening AF_LOCAL stream socket bound to a filesystem path.
 * With systemd support compiled in and use_systemd set, a socket activated
 * by systemd for this path is returned if there is one; otherwise the
 * socket is created here: an existing file at path is unlinked, the socket
 * is bound, made nonblocking and put in listen mode.
 * @param path: filesystem path of the socket. A path that does not fit in
 *	sockaddr_un.sun_path is rejected, so that the socket is never bound
 *	under a silently truncated name.
 * @param noproto: set to 1 only if local sockets are not supported by this
 *	build (no sys/un.h); otherwise not touched.
 * @param use_systemd: if true, try to fetch the socket from systemd first.
 * @return the fd, or -1 on failure (the error has been logged).
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif
#ifdef HAVE_SYSTEMD
	int ret;

	if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM,
		1, NULL, 0, path)) != -1)
		return ret;
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	verbose(VERB_ALGO, "creating unix socket %s", path);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD; strlcpy returns the length of
	 * the source, so a result >= the buffer size means truncation */
	if (strlcpy(usock.sun_path, path, sizeof(usock.sun_path)) >=
		sizeof(usock.sun_path)) {
		log_err("Cannot create local socket %s (path too long)", path);
		return -1;
	}

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
   1043 
   1044 
/**
 * Create socket from getaddrinfo results.
 * Resolves ifname and port with getaddrinfo, creates a UDP socket
 * (stype SOCK_DGRAM) or a TCP accept socket (any other stype) for the first
 * result, and on success stores a copy of the bound address in ub_sock.
 * @param stype: socket type, written into hints->ai_socktype.
 * @param ifname: node name for getaddrinfo; NULL is logged as "default".
 * @param port: port number, passed to getaddrinfo in decimal.
 * @param hints: getaddrinfo hints; ai_socktype is overwritten.
 * @param v6only, rcv, snd, reuseport, transparent, tcp_mss, nodelay,
 *	freebind, use_systemd, dscp: passed on to the socket creation
 *	routines (rcv and snd only for UDP; tcp_mss and nodelay only for TCP).
 * @param noip6: set to 1 if creation failed because the protocol is not
 *	available and hints->ai_family is AF_INET6; 0 otherwise.
 * @param ub_sock: on success receives addr (malloced copy), addrlen, s,
 *	fam and acl (NULL). On failure this routine leaves no allocation of
 *	its own behind in it.
 * @param additional: extra text for verbose logging of the address.
 * @return the fd, or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, int port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock,
	const char* additional)
{
	struct addrinfo *res = NULL;
	int r, s, inuse = 0, noproto = 0;
	char portbuf[32];
	snprintf(portbuf, sizeof(portbuf), "%d", port);
	hints->ai_socktype = stype;
	*noip6 = 0;
	if((r=getaddrinfo(ifname, portbuf, hints, &res)) != 0 || !res) {
#ifdef USE_WINSOCK
		if(r == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", portbuf, gai_strerror(r),
#ifdef EAI_SYSTEM
			(r==EAI_SYSTEM?(char*)strerror(errno):"")
#else
			""
#endif
		);
		return -1;
	}
	/* check the address before it is handed to the socket creation
	 * routines below, not after */
	if(!res->ai_addr) {
		log_err("getaddrinfo returned no address");
		freeaddrinfo(res);
		return -1;
	}
	if(stype == SOCK_DGRAM) {
		verbose_print_addr(res, additional);
		s = create_udp_sock(res->ai_family, res->ai_socktype,
			(struct sockaddr*)res->ai_addr, res->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(s == -1 && inuse) {
			log_err("bind: address already in use");
		} else if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	} else	{
		s = create_tcp_accept_sock(res, v6only, &noproto, reuseport,
			transparent, tcp_mss, nodelay, freebind, use_systemd,
			dscp, additional);
		if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	}
	if(s == -1) {
		/* no socket, so there is no address to remember and no fd
		 * to close */
		freeaddrinfo(res);
		return -1;
	}

	ub_sock->addr = memdup(res->ai_addr, res->ai_addrlen);
	ub_sock->addrlen = res->ai_addrlen;
	if(!ub_sock->addr) {
		log_err("out of memory: allocate listening address");
		freeaddrinfo(res);
		sock_close(s);
		return -1;
	}
	freeaddrinfo(res);

	ub_sock->s = s;
	ub_sock->fam = hints->ai_family;
	ub_sock->acl = NULL;

	return s;
}
   1120 
/**
 * Make socket and first see if ifname contains port override info.
 * An interface written as "name@port" is split at the first '@'; the part
 * after it replaces the port argument and the part before it is used as the
 * interface name. Without '@' (or with a NULL ifname) the arguments are
 * passed to make_sock unchanged.
 * The port text is parsed with strtol: it must start with a number in
 * 1..65535 (characters after the number are ignored); anything else,
 * including values too large for a long, is rejected.
 * @return the fd from make_sock, or -1 on failure. When the interface
 *	string itself is rejected here, *noip6 is set to 0.
 */
static int
make_sock_port(int stype, const char* ifname, int port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock,
	const char* additional)
{
	const char* at = ifname ? strchr(ifname, '@') : NULL;
	if(at) {
		/* override port with ifspec@port */
		char newif[128];
		char* end = NULL;
		long newport;
		size_t iflen = (size_t)(at - ifname);
		if(iflen >= sizeof(newif)) {
			log_err("ifname too long: %s", ifname);
			*noip6 = 0;
			return -1;
		}
		/* strtol clamps on overflow, so the range check below also
		 * rejects numbers that do not fit */
		newport = strtol(at+1, &end, 10);
		if(end == at+1 || newport < 1 || newport > 65535) {
			log_err("invalid portnumber in interface: %s", ifname);
			*noip6 = 0;
			return -1;
		}
		(void)strlcpy(newif, ifname, sizeof(newif));
		newif[iflen] = 0;
		return make_sock(stype, newif, (int)newport, hints, v6only,
			noip6, rcv, snd, reuseport, transparent, tcp_mss,
			nodelay, freebind, use_systemd, dscp, ub_sock,
			additional);
	}
	return make_sock(stype, ifname, port, hints, v6only, noip6, rcv, snd,
		reuseport, transparent, tcp_mss, nodelay, freebind, use_systemd,
		dscp, ub_sock, additional);
}
   1155 
   1156 /**
   1157  * Add port to open ports list.
   1158  * @param list: list head. changed.
   1159  * @param s: fd.
   1160  * @param ftype: if fd is UDP.
   1161  * @param pp2_enabled: if PROXYv2 is enabled for this port.
   1162  * @param ub_sock: socket with address.
   1163  * @return false on failure. list in unchanged then.
   1164  */
   1165 static int
   1166 port_insert(struct listen_port** list, int s, enum listen_type ftype,
   1167 	int pp2_enabled, struct unbound_socket* ub_sock)
   1168 {
   1169 	struct listen_port* item = (struct listen_port*)malloc(
   1170 		sizeof(struct listen_port));
   1171 	if(!item)
   1172 		return 0;
   1173 	item->next = *list;
   1174 	item->fd = s;
   1175 	item->ftype = ftype;
   1176 	item->pp2_enabled = pp2_enabled;
   1177 	item->socket = ub_sock;
   1178 	*list = item;
   1179 	return 1;
   1180 }
   1181 
/**
 * Set fd to receive software timestamps.
 * Uses SO_TIMESTAMPNS when linux/net_tstamp.h is available, otherwise
 * SO_TIMESTAMP when that and SCM_TIMESTAMP are defined.
 * @param s: fd.
 * @return 1 if the option was set, 0 if setsockopt failed or the platform
 *	has neither option; an error is logged in both failure cases.
 */
static int
set_recvtimestamp(int s)
{
#ifdef HAVE_LINUX_NET_TSTAMP_H
	/* NOTE(review): SOF_TIMESTAMPING_* flags are passed as the value of
	 * SO_TIMESTAMPNS; presumably any nonzero value just enables the
	 * option - confirm against the socket option documentation. */
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int r = setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&flags,
		(socklen_t)sizeof(flags));
	if(r >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#elif defined(SO_TIMESTAMP) && defined(SCM_TIMESTAMP)
	/* FreeBSD and also Linux. */
	int enable = 1;
	int r = setsockopt(s, SOL_SOCKET, SO_TIMESTAMP, (void*)&enable,
		(socklen_t)sizeof(enable));
	if(r >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMP, ...) failed: %s",
		strerror(errno));
	return 0;
#else
	log_err("packets timestamping is not supported on this platform");
	(void)s;
	return 0;
#endif
}
   1209 
/**
 * Set fd to receive source address packet info.
 * For AF_INET6 this enables IPV6_RECVPKTINFO, or IPV6_PKTINFO if the former
 * is not defined. For AF_INET it enables IP_PKTINFO, or IP_RECVDSTADDR when
 * that and IP_SENDSRCADDR are defined. Other families are left alone.
 * @param s: fd.
 * @param family: address family of the fd.
 * @return 0 if the option could not be set or no suitable option exists
 *	for the family (logged), 1 otherwise.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int enable = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif
	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif
	default:
		/* nothing to set up for other families */
		break;
	}
	return 1;
}
   1264 
   1265 /**
   1266  * Helper for ports_open. Creates one interface (or NULL for default).
   1267  * @param ifname: The interface ip address.
   1268  * @param do_auto: use automatic interface detection.
   1269  * 	If enabled, then ifname must be the wildcard name.
   1270  * @param do_udp: if udp should be used.
   1271  * @param do_tcp: if tcp should be used.
   1272  * @param hints: for getaddrinfo. family and flags have to be set by caller.
   1273  * @param port: Port number to use.
   1274  * @param list: list of open ports, appended to, changed to point to list head.
   1275  * @param rcv: receive buffer size for UDP
   1276  * @param snd: send buffer size for UDP
   1277  * @param ssl_port: ssl service port number
   1278  * @param tls_additional_port: list of additional ssl service port numbers.
   1279  * @param https_port: DoH service port number
   1280  * @param proxy_protocol_port: list of PROXYv2 port numbers.
   1281  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
   1282  * 	set to false on exit if reuseport failed due to no kernel support.
   1283  * @param transparent: set IP_TRANSPARENT socket option.
   1284  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
   1285  * @param freebind: set IP_FREEBIND socket option.
   1286  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
   1287  * @param use_systemd: if true, fetch sockets from systemd.
   1288  * @param dnscrypt_port: dnscrypt service port number
   1289  * @param dscp: DSCP to use.
   1290  * @param quic_port: dns over quic port number.
   1291  * @param http_notls_downstream: if no tls is used for https downstream.
   1292  * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
   1293  * 	wait to discard if UDP packets have waited for long in the socket
   1294  * 	buffer.
   1295  * @return: returns false on error.
   1296  */
   1297 static int
   1298 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
   1299 	struct addrinfo *hints, int port, struct listen_port** list,
   1300 	size_t rcv, size_t snd, int ssl_port,
   1301 	struct config_strlist* tls_additional_port, int https_port,
   1302 	struct config_strlist* proxy_protocol_port,
   1303 	int* reuseport, int transparent, int tcp_mss, int freebind,
   1304 	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
   1305 	int quic_port, int http_notls_downstream, int sock_queue_timeout)
   1306 {
   1307 	int s, noip6=0;
   1308 	int is_ssl = if_is_ssl(ifname, port, ssl_port, tls_additional_port);
   1309 	int is_https = if_is_https(ifname, port, https_port);
   1310 	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
   1311 	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
   1312 	int is_doq = if_is_quic(ifname, port, quic_port);
   1313 	/* Always set TCP_NODELAY on TLS connection as it speeds up the TLS
   1314 	 * handshake. DoH had already such option so we respect it.
   1315 	 * Otherwise the server waits before sending more handshake data for
   1316 	 * the client ACK (Nagle's algorithm), which is delayed because the
   1317 	 * client waits for more data before ACKing (delayed ACK). */
   1318 	int nodelay = is_https?http2_nodelay:is_ssl;
   1319 	struct unbound_socket* ub_sock;
   1320 	const char* add = NULL;
   1321 
   1322 	if(!do_udp && !do_tcp)
   1323 		return 0;
   1324 
   1325 	if(is_pp2) {
   1326 		if(is_dnscrypt) {
   1327 			fatal_exit("PROXYv2 and DNSCrypt combination not "
   1328 				"supported!");
   1329 		} else if(is_https) {
   1330 			fatal_exit("PROXYv2 and DoH combination not "
   1331 				"supported!");
   1332 		} else if(is_doq) {
   1333 			fatal_exit("PROXYv2 and DoQ combination not "
   1334 				"supported!");
   1335 		}
   1336 	}
   1337 
   1338 	/* Check if both UDP and TCP ports should be open.
   1339 	 * In the case of encrypted channels, probably an unencrypted channel
   1340 	 * at the same port is not desired. */
   1341 	if((is_ssl || is_https) && !is_doq) do_udp = do_auto = 0;
   1342 	if((is_doq) && !(is_https || is_ssl)) do_tcp = 0;
   1343 
   1344 	if(do_auto) {
   1345 		ub_sock = calloc(1, sizeof(struct unbound_socket));
   1346 		if(!ub_sock)
   1347 			return 0;
   1348 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
   1349 			&noip6, rcv, snd, reuseport, transparent,
   1350 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock,
   1351 			(is_dnscrypt?"udpancil_dnscrypt":"udpancil"))) == -1) {
   1352 			free(ub_sock->addr);
   1353 			free(ub_sock);
   1354 			if(noip6) {
   1355 				log_warn("IPv6 protocol not available");
   1356 				return 1;
   1357 			}
   1358 			return 0;
   1359 		}
   1360 		/* getting source addr packet info is highly non-portable */
   1361 		if(!set_recvpktinfo(s, hints->ai_family)) {
   1362 			sock_close(s);
   1363 			free(ub_sock->addr);
   1364 			free(ub_sock);
   1365 			return 0;
   1366 		}
   1367 		if (sock_queue_timeout && !set_recvtimestamp(s)) {
   1368 			log_warn("socket timestamping is not available");
   1369 		}
   1370 		if(!port_insert(list, s, is_dnscrypt
   1371 			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
   1372 			is_pp2, ub_sock)) {
   1373 			sock_close(s);
   1374 			free(ub_sock->addr);
   1375 			free(ub_sock);
   1376 			return 0;
   1377 		}
   1378 	} else if(do_udp) {
   1379 		enum listen_type udp_port_type;
   1380 		ub_sock = calloc(1, sizeof(struct unbound_socket));
   1381 		if(!ub_sock)
   1382 			return 0;
   1383 		if(is_dnscrypt) {
   1384 			udp_port_type = listen_type_udp_dnscrypt;
   1385 			add = "dnscrypt";
   1386 		} else if(is_doq) {
   1387 			udp_port_type = listen_type_doq;
   1388 			add = "doq";
   1389 			if(if_listens_on(ifname, port, 53, NULL)) {
   1390 				log_err("DNS over QUIC is strictly not "
   1391 					"allowed on port 53 as per RFC 9250. "
   1392 					"Port 53 is for DNS datagrams. Error "
   1393 					"for interface '%s'.", ifname);
   1394 				free(ub_sock->addr);
   1395 				free(ub_sock);
   1396 				return 0;
   1397 			}
   1398 		} else {
   1399 			udp_port_type = listen_type_udp;
   1400 			add = NULL;
   1401 		}
   1402 		/* regular udp socket */
   1403 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
   1404 			&noip6, rcv, snd, reuseport, transparent,
   1405 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock,
   1406 			add)) == -1) {
   1407 			free(ub_sock->addr);
   1408 			free(ub_sock);
   1409 			if(noip6) {
   1410 				log_warn("IPv6 protocol not available");
   1411 				return 1;
   1412 			}
   1413 			return 0;
   1414 		}
   1415 		if(udp_port_type == listen_type_doq) {
   1416 			if(!set_recvpktinfo(s, hints->ai_family)) {
   1417 				sock_close(s);
   1418 				free(ub_sock->addr);
   1419 				free(ub_sock);
   1420 				return 0;
   1421 			}
   1422 		}
   1423 		if(udp_port_type == listen_type_udp && sock_queue_timeout)
   1424 			udp_port_type = listen_type_udpancil;
   1425 		if (sock_queue_timeout) {
   1426 			if(!set_recvtimestamp(s)) {
   1427 				log_warn("socket timestamping is not available");
   1428 			} else {
   1429 				if(udp_port_type == listen_type_udp)
   1430 					udp_port_type = listen_type_udpancil;
   1431 			}
   1432 		}
   1433 		if(!port_insert(list, s, udp_port_type, is_pp2, ub_sock)) {
   1434 			sock_close(s);
   1435 			free(ub_sock->addr);
   1436 			free(ub_sock);
   1437 			return 0;
   1438 		}
   1439 	}
   1440 	if(do_tcp) {
   1441 		enum listen_type port_type;
   1442 		ub_sock = calloc(1, sizeof(struct unbound_socket));
   1443 		if(!ub_sock)
   1444 			return 0;
   1445 		if(is_ssl) {
   1446 			port_type = listen_type_ssl;
   1447 			add = "tls";
   1448 		} else if(is_https) {
   1449 			port_type = listen_type_http;
   1450 			add = "https";
   1451 			if(http_notls_downstream)
   1452 				add = "http";
   1453 		} else if(is_dnscrypt) {
   1454 			port_type = listen_type_tcp_dnscrypt;
   1455 			add = "dnscrypt";
   1456 		} else {
   1457 			port_type = listen_type_tcp;
   1458 			add = NULL;
   1459 		}
   1460 		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
   1461 			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
   1462 			freebind, use_systemd, dscp, ub_sock, add)) == -1) {
   1463 			free(ub_sock->addr);
   1464 			free(ub_sock);
   1465 			if(noip6) {
   1466 				/*log_warn("IPv6 protocol not available");*/
   1467 				return 1;
   1468 			}
   1469 			return 0;
   1470 		}
   1471 		if(is_ssl)
   1472 			verbose(VERB_ALGO, "setup TCP for SSL service");
   1473 		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
   1474 			sock_close(s);
   1475 			free(ub_sock->addr);
   1476 			free(ub_sock);
   1477 			return 0;
   1478 		}
   1479 	}
   1480 	return 1;
   1481 }
   1482 
   1483 /**
   1484  * Add items to commpoint list in front.
   1485  * @param c: commpoint to add.
   1486  * @param front: listen struct.
   1487  * @return: false on failure.
   1488  */
   1489 static int
   1490 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
   1491 {
   1492 	struct listen_list* item = (struct listen_list*)malloc(
   1493 		sizeof(struct listen_list));
   1494 	if(!item)
   1495 		return 0;
   1496 	item->com = c;
   1497 	item->next = front->cps;
   1498 	front->cps = item;
   1499 	return 1;
   1500 }
   1501 
   1502 void listen_setup_locks(void)
   1503 {
   1504 	if(!stream_wait_lock_inited) {
   1505 		lock_basic_init(&stream_wait_count_lock);
   1506 		stream_wait_lock_inited = 1;
   1507 	}
   1508 	if(!http2_query_buffer_lock_inited) {
   1509 		lock_basic_init(&http2_query_buffer_count_lock);
   1510 		http2_query_buffer_lock_inited = 1;
   1511 	}
   1512 	if(!http2_response_buffer_lock_inited) {
   1513 		lock_basic_init(&http2_response_buffer_count_lock);
   1514 		http2_response_buffer_lock_inited = 1;
   1515 	}
   1516 }
   1517 
   1518 void listen_desetup_locks(void)
   1519 {
   1520 	if(stream_wait_lock_inited) {
   1521 		stream_wait_lock_inited = 0;
   1522 		lock_basic_destroy(&stream_wait_count_lock);
   1523 	}
   1524 	if(http2_query_buffer_lock_inited) {
   1525 		http2_query_buffer_lock_inited = 0;
   1526 		lock_basic_destroy(&http2_query_buffer_count_lock);
   1527 	}
   1528 	if(http2_response_buffer_lock_inited) {
   1529 		http2_response_buffer_lock_inited = 0;
   1530 		lock_basic_destroy(&http2_response_buffer_count_lock);
   1531 	}
   1532 }
   1533 
   1534 struct listen_dnsport*
   1535 listen_create(struct comm_base* base, struct listen_port* ports,
   1536 	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
   1537 	int harden_large_queries, uint32_t http_max_streams,
   1538 	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
   1539 	void* dot_sslctx, void* doh_sslctx, void* quic_sslctx,
   1540 	struct dt_env* dtenv,
   1541 	struct doq_table* doq_table,
   1542 	struct ub_randstate* rnd,struct config_file* cfg,
   1543 	comm_point_callback_type* cb, void *cb_arg)
   1544 {
   1545 	struct listen_dnsport* front = (struct listen_dnsport*)
   1546 		malloc(sizeof(struct listen_dnsport));
   1547 	if(!front)
   1548 		return NULL;
   1549 	front->cps = NULL;
   1550 	front->udp_buff = sldns_buffer_new(bufsize);
   1551 #ifdef USE_DNSCRYPT
   1552 	front->dnscrypt_udp_buff = NULL;
   1553 #endif
   1554 	if(!front->udp_buff) {
   1555 		free(front);
   1556 		return NULL;
   1557 	}
   1558 
   1559 	/* create comm points as needed */
   1560 	while(ports) {
   1561 		struct comm_point* cp = NULL;
   1562 		if(ports->ftype == listen_type_udp ||
   1563 		   ports->ftype == listen_type_udp_dnscrypt) {
   1564 			cp = comm_point_create_udp(base, ports->fd,
   1565 				front->udp_buff, ports->pp2_enabled, cb,
   1566 				cb_arg, ports->socket);
   1567 		} else if(ports->ftype == listen_type_doq && doq_table) {
   1568 #ifndef HAVE_NGTCP2
   1569 			log_warn("Unbound is not compiled with "
   1570 				"ngtcp2. This is required to use DNS "
   1571 				"over QUIC.");
   1572 #endif
   1573 			cp = comm_point_create_doq(base, ports->fd,
   1574 				front->udp_buff, cb, cb_arg, ports->socket,
   1575 				doq_table, rnd, quic_sslctx, cfg);
   1576 		} else if(ports->ftype == listen_type_tcp ||
   1577 				ports->ftype == listen_type_tcp_dnscrypt) {
   1578 			cp = comm_point_create_tcp(base, ports->fd,
   1579 				tcp_accept_count, tcp_idle_timeout,
   1580 				harden_large_queries, 0, NULL,
   1581 				tcp_conn_limit, bufsize, front->udp_buff,
   1582 				ports->ftype, ports->pp2_enabled, cb, cb_arg,
   1583 				ports->socket);
   1584 		} else if(ports->ftype == listen_type_ssl ||
   1585 			ports->ftype == listen_type_http) {
   1586 			cp = comm_point_create_tcp(base, ports->fd,
   1587 				tcp_accept_count, tcp_idle_timeout,
   1588 				harden_large_queries,
   1589 				http_max_streams, http_endpoint,
   1590 				tcp_conn_limit, bufsize, front->udp_buff,
   1591 				ports->ftype, ports->pp2_enabled, cb, cb_arg,
   1592 				ports->socket);
   1593 			if(ports->ftype == listen_type_http) {
   1594 				if(!doh_sslctx && !http_notls) {
   1595 					log_warn("HTTPS port configured, but "
   1596 						"no TLS tls-service-key or "
   1597 						"tls-service-pem set");
   1598 				}
   1599 #ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
   1600 				if(!http_notls) {
   1601 					log_warn("Unbound is not compiled "
   1602 						"with an OpenSSL version "
   1603 						"supporting ALPN "
   1604 						"(OpenSSL >= 1.0.2). This "
   1605 						"is required to use "
   1606 						"DNS-over-HTTPS");
   1607 				}
   1608 #endif
   1609 #ifndef HAVE_NGHTTP2_NGHTTP2_H
   1610 				log_warn("Unbound is not compiled with "
   1611 					"nghttp2. This is required to use "
   1612 					"DNS-over-HTTPS.");
   1613 #endif
   1614 			}
   1615 		} else if(ports->ftype == listen_type_udpancil ||
   1616 				  ports->ftype == listen_type_udpancil_dnscrypt) {
   1617 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
   1618 			cp = comm_point_create_udp_ancil(base, ports->fd,
   1619 				front->udp_buff, ports->pp2_enabled, cb,
   1620 				cb_arg, ports->socket);
   1621 #else
   1622 			log_warn("This system does not support UDP ancillary data.");
   1623 #endif
   1624 		}
   1625 		if(!cp) {
   1626 			log_err("can't create commpoint");
   1627 			listen_delete(front);
   1628 			return NULL;
   1629 		}
   1630 		if((http_notls && ports->ftype == listen_type_http) ||
   1631 			(ports->ftype == listen_type_tcp) ||
   1632 			(ports->ftype == listen_type_udp) ||
   1633 			(ports->ftype == listen_type_udpancil) ||
   1634 			(ports->ftype == listen_type_tcp_dnscrypt) ||
   1635 			(ports->ftype == listen_type_udp_dnscrypt) ||
   1636 			(ports->ftype == listen_type_udpancil_dnscrypt)) {
   1637 			cp->ssl = NULL;
   1638 		} else if(ports->ftype == listen_type_doq) {
   1639 			cp->ssl = quic_sslctx;
   1640 		} else if(ports->ftype == listen_type_http) {
   1641 			cp->ssl = doh_sslctx;
   1642 		} else {
   1643 			cp->ssl = dot_sslctx;
   1644 		}
   1645 		cp->dtenv = dtenv;
   1646 		cp->do_not_close = 1;
   1647 #ifdef USE_DNSCRYPT
   1648 		if (ports->ftype == listen_type_udp_dnscrypt ||
   1649 			ports->ftype == listen_type_tcp_dnscrypt ||
   1650 			ports->ftype == listen_type_udpancil_dnscrypt) {
   1651 			cp->dnscrypt = 1;
   1652 			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
   1653 			if(!cp->dnscrypt_buffer) {
   1654 				log_err("can't alloc dnscrypt_buffer");
   1655 				comm_point_delete(cp);
   1656 				listen_delete(front);
   1657 				return NULL;
   1658 			}
   1659 			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
   1660 		}
   1661 #endif
   1662 		if(!listen_cp_insert(cp, front)) {
   1663 			log_err("malloc failed");
   1664 			comm_point_delete(cp);
   1665 			listen_delete(front);
   1666 			return NULL;
   1667 		}
   1668 		ports = ports->next;
   1669 	}
   1670 	if(!front->cps) {
   1671 		log_err("Could not open sockets to accept queries.");
   1672 		listen_delete(front);
   1673 		return NULL;
   1674 	}
   1675 
   1676 	return front;
   1677 }
   1678 
   1679 void
   1680 listen_list_delete(struct listen_list* list)
   1681 {
   1682 	struct listen_list *p = list, *pn;
   1683 	while(p) {
   1684 		pn = p->next;
   1685 		comm_point_delete(p->com);
   1686 		free(p);
   1687 		p = pn;
   1688 	}
   1689 }
   1690 
   1691 void
   1692 listen_delete(struct listen_dnsport* front)
   1693 {
   1694 	if(!front)
   1695 		return;
   1696 	listen_list_delete(front->cps);
   1697 #ifdef USE_DNSCRYPT
   1698 	if(front->dnscrypt_udp_buff &&
   1699 		front->udp_buff != front->dnscrypt_udp_buff) {
   1700 		sldns_buffer_free(front->dnscrypt_udp_buff);
   1701 	}
   1702 #endif
   1703 	sldns_buffer_free(front->udp_buff);
   1704 	free(front);
   1705 }
   1706 
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of a string to a growing array of strings.
 * The array is enlarged by one slot with realloc and the string is
 * duplicated with strdup; the size is only incremented on success.
 * @param ip_addresses: pointer to the array, updated when it is moved.
 * @param ip_addresses_size: number of entries, incremented on success.
 * @param str: string to copy onto the end of the array.
 * @return 0 on allocation failure (logged), 1 otherwise.
 */
static int
ifa_append_str(char ***ip_addresses, int *ip_addresses_size, const char *str)
{
	char **grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	grown[*ip_addresses_size] = strdup(str);
	if(!grown[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Resolve an interface name to the addresses configured on it.
 * The search string is "name" or "name@port".  For every entry in ifas
 * whose name equals the name part and that has an IPv4 (or, with INET6,
 * IPv6) address, the printed address with the "@port" part appended is
 * added to the array.  If no entry was added, the search string itself is
 * added unchanged.
 * @param ifas: list as obtained from getifaddrs.
 * @param search_ifa: interface name to look for, optionally with @port.
 * @param ip_addresses: array of strings that is appended to.
 * @param ip_addresses_size: number of entries in the array, updated.
 * @return 0 on failure (inet_ntop or allocation, logged), 1 otherwise.
 *	On failure, entries appended so far are left in the array.
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	struct ifaddrs *cur;
	const int size_on_entry = *ip_addresses_size;
	/* the last '@' separates the interface name from the port part */
	const char *port_suffix = strrchr(search_ifa, '@');
	size_t name_len;

	if(port_suffix != NULL) {
		name_len = (size_t)(port_suffix - search_ifa);
	} else {
		name_len = strlen(search_ifa);
		port_suffix = "";
	}

	for(cur = ifas; cur != NULL; cur = cur->ifa_next) {
		sa_family_t family;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the name must match exactly, not just as a prefix */
		if(strlen(cur->ifa_name) != name_len ||
			strncmp(cur->ifa_name, search_ifa, name_len) != 0)
			continue;
		if(cur->ifa_addr == NULL)
			continue;

		family = cur->ifa_addr->sa_family;
		if(family == AF_INET) {
			struct sockaddr_in *sin = (struct sockaddr_in *)
				cur->ifa_addr;
			char text4[INET_ADDRSTRLEN + 1];
			if(!inet_ntop(family, &sin->sin_addr, text4,
				sizeof(text4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				text4, port_suffix);
		}
#ifdef INET6
		else if(family == AF_INET6) {
			struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
				cur->ifa_addr;
			char text6[INET6_ADDRSTRLEN + 1];
			char scope_name[IF_NAMESIZE + 1];
			scope_name[0] = 0;
			if(!inet_ntop(family, &sin6->sin6_addr, text6,
				sizeof(text6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* result ignored: scope_name stays empty when the
			 * scope id has no interface name */
			(void)if_indextoname(sin6->sin6_scope_id,
				(char *)scope_name);
			if(scope_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", text6, scope_name,
					port_suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					text6, port_suffix);
			}
		}
#endif
		else {
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!ifa_append_str(ip_addresses, ip_addresses_size, addr_buf))
			return 0;
	}

	/* nothing matched: pass the configured string through unchanged */
	if(*ip_addresses_size == size_on_entry)
		return ifa_append_str(ip_addresses, ip_addresses_size,
			search_ifa);
	return 1;
}
#endif /* HAVE_GETIFADDRS */
   1810 
   1811 int resolve_interface_names(char** ifs, int num_ifs,
   1812 	struct config_strlist* list, char*** resif, int* num_resif)
   1813 {
   1814 #ifdef HAVE_GETIFADDRS
   1815 	struct ifaddrs *addrs = NULL;
   1816 	if(num_ifs == 0 && list == NULL) {
   1817 		*resif = NULL;
   1818 		*num_resif = 0;
   1819 		return 1;
   1820 	}
   1821 	if(getifaddrs(&addrs) == -1) {
   1822 		log_err("failed to list interfaces: getifaddrs: %s",
   1823 			strerror(errno));
   1824 		freeifaddrs(addrs);
   1825 		return 0;
   1826 	}
   1827 	if(ifs) {
   1828 		int i;
   1829 		for(i=0; i<num_ifs; i++) {
   1830 			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
   1831 				freeifaddrs(addrs);
   1832 				config_del_strarray(*resif, *num_resif);
   1833 				*resif = NULL;
   1834 				*num_resif = 0;
   1835 				return 0;
   1836 			}
   1837 		}
   1838 	}
   1839 	if(list) {
   1840 		struct config_strlist* p;
   1841 		for(p = list; p; p = p->next) {
   1842 			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
   1843 				freeifaddrs(addrs);
   1844 				config_del_strarray(*resif, *num_resif);
   1845 				*resif = NULL;
   1846 				*num_resif = 0;
   1847 				return 0;
   1848 			}
   1849 }
   1850 	}
   1851 	freeifaddrs(addrs);
   1852 	return 1;
   1853 #else
   1854 	struct config_strlist* p;
   1855 	if(num_ifs == 0 && list == NULL) {
   1856 		*resif = NULL;
   1857 		*num_resif = 0;
   1858 		return 1;
   1859 	}
   1860 	*num_resif = num_ifs;
   1861 	for(p = list; p; p = p->next) {
   1862 		(*num_resif)++;
   1863 	}
   1864 	*resif = calloc(*num_resif, sizeof(**resif));
   1865 	if(!*resif) {
   1866 		log_err("out of memory");
   1867 		return 0;
   1868 	}
   1869 	if(ifs) {
   1870 		int i;
   1871 		for(i=0; i<num_ifs; i++) {
   1872 			(*resif)[i] = strdup(ifs[i]);
   1873 			if(!((*resif)[i])) {
   1874 				log_err("out of memory");
   1875 				config_del_strarray(*resif, *num_resif);
   1876 				*resif = NULL;
   1877 				*num_resif = 0;
   1878 				return 0;
   1879 			}
   1880 		}
   1881 	}
   1882 	if(list) {
   1883 		int idx = num_ifs;
   1884 		for(p = list; p; p = p->next) {
   1885 			(*resif)[idx] = strdup(p->str);
   1886 			if(!((*resif)[idx])) {
   1887 				log_err("out of memory");
   1888 				config_del_strarray(*resif, *num_resif);
   1889 				*resif = NULL;
   1890 				*num_resif = 0;
   1891 				return 0;
   1892 			}
   1893 			idx++;
   1894 		}
   1895 	}
   1896 	return 1;
   1897 #endif /* HAVE_GETIFADDRS */
   1898 }
   1899 
   1900 struct listen_port*
   1901 listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
   1902 	int* reuseport)
   1903 {
   1904 	struct listen_port* list = NULL;
   1905 	struct addrinfo hints;
   1906 	int i, do_ip4, do_ip6;
   1907 	int do_tcp, do_auto;
   1908 	do_ip4 = cfg->do_ip4;
   1909 	do_ip6 = cfg->do_ip6;
   1910 	do_tcp = cfg->do_tcp;
   1911 	do_auto = cfg->if_automatic && cfg->do_udp;
   1912 	if(cfg->incoming_num_tcp == 0)
   1913 		do_tcp = 0;
   1914 
   1915 	/* getaddrinfo */
   1916 	memset(&hints, 0, sizeof(hints));
   1917 	hints.ai_flags = AI_PASSIVE;
   1918 	/* no name lookups on our listening ports */
   1919 	if(num_ifs > 0)
   1920 		hints.ai_flags |= AI_NUMERICHOST;
   1921 	hints.ai_family = AF_UNSPEC;
   1922 #ifndef INET6
   1923 	do_ip6 = 0;
   1924 #endif
   1925 	if(!do_ip4 && !do_ip6) {
   1926 		return NULL;
   1927 	}
   1928 	/* create ip4 and ip6 ports so that return addresses are nice. */
   1929 	if(do_auto || num_ifs == 0) {
   1930 		if(do_auto && cfg->if_automatic_ports &&
   1931 			cfg->if_automatic_ports[0]!=0) {
   1932 			char* now = cfg->if_automatic_ports;
   1933 			while(now && *now) {
   1934 				char* after;
   1935 				int extraport;
   1936 				while(isspace((unsigned char)*now))
   1937 					now++;
   1938 				if(!*now)
   1939 					break;
   1940 				after = now;
   1941 				extraport = (int)strtol(now, &after, 10);
   1942 				if(extraport < 0 || extraport > 65535) {
   1943 					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
   1944 					listening_ports_free(list);
   1945 					return NULL;
   1946 				}
   1947 				if(extraport == 0 && now == after) {
   1948 					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
   1949 					listening_ports_free(list);
   1950 					return NULL;
   1951 				}
   1952 				now = after;
   1953 				if(do_ip6) {
   1954 					hints.ai_family = AF_INET6;
   1955 					if(!ports_create_if("::0",
   1956 						do_auto, cfg->do_udp, do_tcp,
   1957 						&hints, extraport, &list,
   1958 						cfg->so_rcvbuf, cfg->so_sndbuf,
   1959 						cfg->ssl_port, cfg->tls_additional_port,
   1960 						cfg->https_port,
   1961 						cfg->proxy_protocol_port,
   1962 						reuseport, cfg->ip_transparent,
   1963 						cfg->tcp_mss, cfg->ip_freebind,
   1964 						cfg->http_nodelay, cfg->use_systemd,
   1965 						cfg->dnscrypt_port, cfg->ip_dscp,
   1966 						cfg->quic_port, cfg->http_notls_downstream,
   1967 						cfg->sock_queue_timeout)) {
   1968 						listening_ports_free(list);
   1969 						return NULL;
   1970 					}
   1971 				}
   1972 				if(do_ip4) {
   1973 					hints.ai_family = AF_INET;
   1974 					if(!ports_create_if("0.0.0.0",
   1975 						do_auto, cfg->do_udp, do_tcp,
   1976 						&hints, extraport, &list,
   1977 						cfg->so_rcvbuf, cfg->so_sndbuf,
   1978 						cfg->ssl_port, cfg->tls_additional_port,
   1979 						cfg->https_port,
   1980 						cfg->proxy_protocol_port,
   1981 						reuseport, cfg->ip_transparent,
   1982 						cfg->tcp_mss, cfg->ip_freebind,
   1983 						cfg->http_nodelay, cfg->use_systemd,
   1984 						cfg->dnscrypt_port, cfg->ip_dscp,
   1985 						cfg->quic_port, cfg->http_notls_downstream,
   1986 						cfg->sock_queue_timeout)) {
   1987 						listening_ports_free(list);
   1988 						return NULL;
   1989 					}
   1990 				}
   1991 			}
   1992 			return list;
   1993 		}
   1994 		if(do_ip6) {
   1995 			hints.ai_family = AF_INET6;
   1996 			if(!ports_create_if(do_auto?"::0":"::1",
   1997 				do_auto, cfg->do_udp, do_tcp,
   1998 				&hints, cfg->port, &list,
   1999 				cfg->so_rcvbuf, cfg->so_sndbuf,
   2000 				cfg->ssl_port, cfg->tls_additional_port,
   2001 				cfg->https_port, cfg->proxy_protocol_port,
   2002 				reuseport, cfg->ip_transparent,
   2003 				cfg->tcp_mss, cfg->ip_freebind,
   2004 				cfg->http_nodelay, cfg->use_systemd,
   2005 				cfg->dnscrypt_port, cfg->ip_dscp,
   2006 				cfg->quic_port, cfg->http_notls_downstream,
   2007 				cfg->sock_queue_timeout)) {
   2008 				listening_ports_free(list);
   2009 				return NULL;
   2010 			}
   2011 		}
   2012 		if(do_ip4) {
   2013 			hints.ai_family = AF_INET;
   2014 			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
   2015 				do_auto, cfg->do_udp, do_tcp,
   2016 				&hints, cfg->port, &list,
   2017 				cfg->so_rcvbuf, cfg->so_sndbuf,
   2018 				cfg->ssl_port, cfg->tls_additional_port,
   2019 				cfg->https_port, cfg->proxy_protocol_port,
   2020 				reuseport, cfg->ip_transparent,
   2021 				cfg->tcp_mss, cfg->ip_freebind,
   2022 				cfg->http_nodelay, cfg->use_systemd,
   2023 				cfg->dnscrypt_port, cfg->ip_dscp,
   2024 				cfg->quic_port, cfg->http_notls_downstream,
   2025 				cfg->sock_queue_timeout)) {
   2026 				listening_ports_free(list);
   2027 				return NULL;
   2028 			}
   2029 		}
   2030 	} else for(i = 0; i<num_ifs; i++) {
   2031 		if(str_is_ip6(ifs[i])) {
   2032 			if(!do_ip6)
   2033 				continue;
   2034 			hints.ai_family = AF_INET6;
   2035 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
   2036 				do_tcp, &hints, cfg->port, &list,
   2037 				cfg->so_rcvbuf, cfg->so_sndbuf,
   2038 				cfg->ssl_port, cfg->tls_additional_port,
   2039 				cfg->https_port, cfg->proxy_protocol_port,
   2040 				reuseport, cfg->ip_transparent,
   2041 				cfg->tcp_mss, cfg->ip_freebind,
   2042 				cfg->http_nodelay, cfg->use_systemd,
   2043 				cfg->dnscrypt_port, cfg->ip_dscp,
   2044 				cfg->quic_port, cfg->http_notls_downstream,
   2045 				cfg->sock_queue_timeout)) {
   2046 				listening_ports_free(list);
   2047 				return NULL;
   2048 			}
   2049 		} else {
   2050 			if(!do_ip4)
   2051 				continue;
   2052 			hints.ai_family = AF_INET;
   2053 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
   2054 				do_tcp, &hints, cfg->port, &list,
   2055 				cfg->so_rcvbuf, cfg->so_sndbuf,
   2056 				cfg->ssl_port, cfg->tls_additional_port,
   2057 				cfg->https_port, cfg->proxy_protocol_port,
   2058 				reuseport, cfg->ip_transparent,
   2059 				cfg->tcp_mss, cfg->ip_freebind,
   2060 				cfg->http_nodelay, cfg->use_systemd,
   2061 				cfg->dnscrypt_port, cfg->ip_dscp,
   2062 				cfg->quic_port, cfg->http_notls_downstream,
   2063 				cfg->sock_queue_timeout)) {
   2064 				listening_ports_free(list);
   2065 				return NULL;
   2066 			}
   2067 		}
   2068 	}
   2069 
   2070 	return list;
   2071 }
   2072 
   2073 void listening_ports_free(struct listen_port* list)
   2074 {
   2075 	struct listen_port* nx;
   2076 	while(list) {
   2077 		nx = list->next;
   2078 		if(list->fd != -1) {
   2079 			sock_close(list->fd);
   2080 		}
   2081 		/* rc_ports don't have ub_socket */
   2082 		if(list->socket) {
   2083 			free(list->socket->addr);
   2084 			free(list->socket);
   2085 		}
   2086 		free(list);
   2087 		list = nx;
   2088 	}
   2089 }
   2090 
   2091 size_t listen_get_mem(struct listen_dnsport* listen)
   2092 {
   2093 	struct listen_list* p;
   2094 	size_t s = sizeof(*listen) + sizeof(*listen->base) +
   2095 		sizeof(*listen->udp_buff) +
   2096 		sldns_buffer_capacity(listen->udp_buff);
   2097 #ifdef USE_DNSCRYPT
   2098 	s += sizeof(*listen->dnscrypt_udp_buff);
   2099 	if(listen->udp_buff != listen->dnscrypt_udp_buff){
   2100 		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
   2101 	}
   2102 #endif
   2103 	for(p = listen->cps; p; p = p->next) {
   2104 		s += sizeof(*p);
   2105 		s += comm_point_get_mem(p->com);
   2106 	}
   2107 	return s;
   2108 }
   2109 
   2110 void listen_stop_accept(struct listen_dnsport* listen)
   2111 {
   2112 	/* do not stop the ones that have no tcp_free list
   2113 	 * (they have already stopped listening) */
   2114 	struct listen_list* p;
   2115 	for(p=listen->cps; p; p=p->next) {
   2116 		if(p->com->type == comm_tcp_accept &&
   2117 			p->com->tcp_free != NULL) {
   2118 			comm_point_stop_listening(p->com);
   2119 		}
   2120 	}
   2121 }
   2122 
   2123 void listen_start_accept(struct listen_dnsport* listen)
   2124 {
   2125 	/* do not start the ones that have no tcp_free list, it is no
   2126 	 * use to listen to them because they have no free tcp handlers */
   2127 	struct listen_list* p;
   2128 	for(p=listen->cps; p; p=p->next) {
   2129 		if(p->com->type == comm_tcp_accept &&
   2130 			p->com->tcp_free != NULL) {
   2131 			comm_point_start_listening(p->com, -1, -1);
   2132 		}
   2133 	}
   2134 }
   2135 
   2136 struct tcp_req_info*
   2137 tcp_req_info_create(struct sldns_buffer* spoolbuf)
   2138 {
   2139 	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
   2140 	if(!req) {
   2141 		log_err("malloc failure for new stream outoforder processing structure");
   2142 		return NULL;
   2143 	}
   2144 	memset(req, 0, sizeof(*req));
   2145 	req->spool_buffer = spoolbuf;
   2146 	return req;
   2147 }
   2148 
/**
 * Delete a tcp request info structure: clear its lists and free it.
 * @param req: structure to delete, may be NULL.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req != NULL) {
		tcp_req_info_clear(req);
		/* req->cp points back to the commpoint that owns this
		 * struct and called delete on us; the spool_buffer is the
		 * shared udp buffer.  Neither is deleted here. */
		free(req);
	}
}
   2159 
   2160 void tcp_req_info_clear(struct tcp_req_info* req)
   2161 {
   2162 	struct tcp_req_open_item* open, *nopen;
   2163 	struct tcp_req_done_item* item, *nitem;
   2164 	if(!req) return;
   2165 
   2166 	/* free outstanding request mesh reply entries */
   2167 	open = req->open_req_list;
   2168 	while(open) {
   2169 		nopen = open->next;
   2170 		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
   2171 		free(open);
   2172 		open = nopen;
   2173 	}
   2174 	req->open_req_list = NULL;
   2175 	req->num_open_req = 0;
   2176 
   2177 	/* free pending writable result packets */
   2178 	item = req->done_req_list;
   2179 	while(item) {
   2180 		nitem = item->next;
   2181 		lock_basic_lock(&stream_wait_count_lock);
   2182 		stream_wait_count -= (sizeof(struct tcp_req_done_item)
   2183 			+item->len);
   2184 		lock_basic_unlock(&stream_wait_count_lock);
   2185 		free(item->buf);
   2186 		free(item);
   2187 		item = nitem;
   2188 	}
   2189 	req->done_req_list = NULL;
   2190 	req->num_done_req = 0;
   2191 	req->read_is_closed = 0;
   2192 }
   2193 
   2194 void
   2195 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
   2196 {
   2197 	struct tcp_req_open_item* open, *prev = NULL;
   2198 	if(!req || !m) return;
   2199 	open = req->open_req_list;
   2200 	while(open) {
   2201 		if(open->mesh_state == m) {
   2202 			struct tcp_req_open_item* next;
   2203 			if(prev) prev->next = open->next;
   2204 			else req->open_req_list = open->next;
   2205 			/* caller has to manage the mesh state reply entry */
   2206 			next = open->next;
   2207 			free(open);
   2208 			req->num_open_req --;
   2209 
   2210 			/* prev = prev; */
   2211 			open = next;
   2212 			continue;
   2213 		}
   2214 		prev = open;
   2215 		open = open->next;
   2216 	}
   2217 }
   2218 
   2219 /** setup listening for read or write */
   2220 static void
   2221 tcp_req_info_setup_listen(struct tcp_req_info* req)
   2222 {
   2223 	int wr = 0;
   2224 	int rd = 0;
   2225 
   2226 	if(req->cp->tcp_byte_count != 0) {
   2227 		/* cannot change, halfway through */
   2228 		return;
   2229 	}
   2230 
   2231 	if(!req->cp->tcp_is_reading)
   2232 		wr = 1;
   2233 	if(!req->read_is_closed)
   2234 		rd = 1;
   2235 
   2236 	if(wr) {
   2237 		req->cp->tcp_is_reading = 0;
   2238 		comm_point_stop_listening(req->cp);
   2239 		comm_point_start_listening(req->cp, -1,
   2240 			adjusted_tcp_timeout(req->cp));
   2241 	} else if(rd) {
   2242 		req->cp->tcp_is_reading = 1;
   2243 		comm_point_stop_listening(req->cp);
   2244 		comm_point_start_listening(req->cp, -1,
   2245 			adjusted_tcp_timeout(req->cp));
   2246 		/* and also read it (from SSL stack buffers), so
   2247 		 * no event read event is expected since the remainder of
   2248 		 * the TLS frame is sitting in the buffers. */
   2249 		req->read_again = 1;
   2250 	} else {
   2251 		comm_point_stop_listening(req->cp);
   2252 		comm_point_start_listening(req->cp, -1,
   2253 			adjusted_tcp_timeout(req->cp));
   2254 		comm_point_listen_for_rw(req->cp, 0, 0);
   2255 	}
   2256 }
   2257 
   2258 /** remove first item from list of pending results */
   2259 static struct tcp_req_done_item*
   2260 tcp_req_info_pop_done(struct tcp_req_info* req)
   2261 {
   2262 	struct tcp_req_done_item* item;
   2263 	log_assert(req->num_done_req > 0 && req->done_req_list);
   2264 	item = req->done_req_list;
   2265 	lock_basic_lock(&stream_wait_count_lock);
   2266 	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
   2267 	lock_basic_unlock(&stream_wait_count_lock);
   2268 	req->done_req_list = req->done_req_list->next;
   2269 	req->num_done_req --;
   2270 	return item;
   2271 }
   2272 
   2273 /** Send given buffer and setup to write */
   2274 static void
   2275 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
   2276 	size_t len)
   2277 {
   2278 	sldns_buffer_clear(req->cp->buffer);
   2279 	sldns_buffer_write(req->cp->buffer, buf, len);
   2280 	sldns_buffer_flip(req->cp->buffer);
   2281 
   2282 	req->cp->tcp_is_reading = 0; /* we are now writing */
   2283 }
   2284 
   2285 /** pick up the next result and start writing it to the channel */
   2286 static void
   2287 tcp_req_pickup_next_result(struct tcp_req_info* req)
   2288 {
   2289 	if(req->num_done_req > 0) {
   2290 		/* unlist the done item from the list of pending results */
   2291 		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
   2292 		tcp_req_info_start_write_buf(req, item->buf, item->len);
   2293 		free(item->buf);
   2294 		free(item);
   2295 	}
   2296 }
   2297 
   2298 /** the read channel has closed */
   2299 int
   2300 tcp_req_info_handle_read_close(struct tcp_req_info* req)
   2301 {
   2302 	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
   2303 	/* RFC 7766 6.2.4 says to drop pending replies when client closes. */
   2304 	return 0; /* drop connection */
   2305 }
   2306 
   2307 void
   2308 tcp_req_info_handle_writedone(struct tcp_req_info* req)
   2309 {
   2310 	/* back to reading state, we finished this write event */
   2311 	sldns_buffer_clear(req->cp->buffer);
   2312 	if(req->num_done_req == 0 && req->read_is_closed) {
   2313 		/* no more to write and nothing to read, close it */
   2314 		comm_point_drop_reply(&req->cp->repinfo);
   2315 		return;
   2316 	}
   2317 	req->cp->tcp_is_reading = 1;
   2318 	/* see if another result needs writing */
   2319 	tcp_req_pickup_next_result(req);
   2320 
   2321 	/* see if there is more to write, if not stop_listening for writing */
   2322 	/* see if new requests are allowed, if so, start_listening
   2323 	 * for reading */
   2324 	tcp_req_info_setup_listen(req);
   2325 }
   2326 
/**
 * A request has been read completely from the stream.
 * Runs the comm point callback on it and then either starts writing the
 * answer, returns because the reply was dropped, or picks up a queued
 * result and sets up listening again.
 * @param req: the tcp request info; req->cp is the comm point used.
 */
void
tcp_req_info_handle_readdone(struct tcp_req_info* req)
{
	struct comm_point* c = req->cp;

	/* we want to read up several requests, unless there are
	 * pending answers */

	/* reset the flags; the code below reads them after the callback to
	 * see if the reply was dropped or sent while it ran */
	req->is_drop = 0;
	req->is_reply = 0;
	/* while set, tcp_req_info_send_reply copies the reply into
	 * c->buffer and sets is_reply instead of starting a write */
	req->in_worker_handle = 1;
	sldns_buffer_set_limit(req->spool_buffer, 0);
	/* handle the current request */
	/* this calls the worker handle request routine that could give
	 * a cache response, or localdata response, or drop the reply,
	 * or schedule a mesh entry for later */
	fptr_ok(fptr_whitelist_comm_point(c->callback));
	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
		req->in_worker_handle = 0;
		/* there is an answer, put it up.  It is already in the
		 * c->buffer, just send it. */
		/* since we were just reading a query, the channel is
		 * clear to write to */
	send_it:
		/* also jumped to from below when is_reply was set */
		c->tcp_is_reading = 0;
		comm_point_stop_listening(c);
		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
		return;
	}
	req->in_worker_handle = 0;
	/* it should be waiting in the mesh for recursion.
	 * If mesh failed to add a new entry and called commpoint_drop_reply.
	 * Then the mesh state has been cleared. */
	if(req->is_drop) {
		/* the reply has been dropped, stream has been closed. */
		return;
	}
	/* If mesh failed(mallocfail) and called commpoint_send_reply with
	 * something like servfail then we pick up that reply below. */
	if(req->is_reply) {
		goto send_it;
	}

	sldns_buffer_clear(c->buffer);
	/* if pending answers, pick up an answer and start sending it */
	tcp_req_pickup_next_result(req);

	/* if answers pending, start sending answers */
	/* read more requests if we can have more requests */
	tcp_req_info_setup_listen(req);
}
   2378 
   2379 int
   2380 tcp_req_info_add_meshstate(struct tcp_req_info* req,
   2381 	struct mesh_area* mesh, struct mesh_state* m)
   2382 {
   2383 	struct tcp_req_open_item* item;
   2384 	log_assert(req && mesh && m);
   2385 	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
   2386 	if(!item) return 0;
   2387 	item->next = req->open_req_list;
   2388 	item->mesh = mesh;
   2389 	item->mesh_state = m;
   2390 	req->open_req_list = item;
   2391 	req->num_open_req++;
   2392 	return 1;
   2393 }
   2394 
   2395 /** Add a result to the result list.  At the end. */
   2396 static int
   2397 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
   2398 {
   2399 	struct tcp_req_done_item* last = NULL;
   2400 	struct tcp_req_done_item* item;
   2401 	size_t space;
   2402 
   2403 	/* see if we have space */
   2404 	space = sizeof(struct tcp_req_done_item) + len;
   2405 	lock_basic_lock(&stream_wait_count_lock);
   2406 	if(stream_wait_count + space > stream_wait_max) {
   2407 		lock_basic_unlock(&stream_wait_count_lock);
   2408 		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
   2409 		return 0;
   2410 	}
   2411 	stream_wait_count += space;
   2412 	lock_basic_unlock(&stream_wait_count_lock);
   2413 
   2414 	/* find last element */
   2415 	last = req->done_req_list;
   2416 	while(last && last->next)
   2417 		last = last->next;
   2418 
   2419 	/* create new element */
   2420 	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
   2421 	if(!item) {
   2422 		log_err("malloc failure, for stream result list");
   2423 		return 0;
   2424 	}
   2425 	item->next = NULL;
   2426 	item->len = len;
   2427 	item->buf = memdup(buf, len);
   2428 	if(!item->buf) {
   2429 		free(item);
   2430 		log_err("malloc failure, adding reply to stream result list");
   2431 		return 0;
   2432 	}
   2433 
   2434 	/* link in */
   2435 	if(last) last->next = item;
   2436 	else req->done_req_list = item;
   2437 	req->num_done_req++;
   2438 	return 1;
   2439 }
   2440 
   2441 void
   2442 tcp_req_info_send_reply(struct tcp_req_info* req)
   2443 {
   2444 	if(req->in_worker_handle) {
   2445 		/* reply from mesh is in the spool_buffer */
   2446 		/* copy now, so that the spool buffer is free for other tasks
   2447 		 * before the callback is done */
   2448 		sldns_buffer_clear(req->cp->buffer);
   2449 		sldns_buffer_write(req->cp->buffer,
   2450 			sldns_buffer_begin(req->spool_buffer),
   2451 			sldns_buffer_limit(req->spool_buffer));
   2452 		sldns_buffer_flip(req->cp->buffer);
   2453 		req->is_reply = 1;
   2454 		return;
   2455 	}
   2456 	/* now that the query has been handled, that mesh_reply entry
   2457 	 * should be removed, from the tcp_req_info list,
   2458 	 * the mesh state cleanup removes then with region_cleanup and
   2459 	 * replies_sent true. */
   2460 	/* see if we can send it straight away (we are not doing
   2461 	 * anything else).  If so, copy to buffer and start */
   2462 	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
   2463 		/* buffer is free, and was ready to read new query into,
   2464 		 * but we are now going to use it to send this answer */
   2465 		tcp_req_info_start_write_buf(req,
   2466 			sldns_buffer_begin(req->spool_buffer),
   2467 			sldns_buffer_limit(req->spool_buffer));
   2468 		/* switch to listen to write events */
   2469 		comm_point_stop_listening(req->cp);
   2470 		comm_point_start_listening(req->cp, -1,
   2471 			adjusted_tcp_timeout(req->cp));
   2472 		return;
   2473 	}
   2474 	/* queue up the answer behind the others already pending */
   2475 	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
   2476 		sldns_buffer_limit(req->spool_buffer))) {
   2477 		/* drop the connection, we are out of resources */
   2478 		comm_point_drop_reply(&req->cp->repinfo);
   2479 	}
   2480 }
   2481 
   2482 size_t tcp_req_info_get_stream_buffer_size(void)
   2483 {
   2484 	size_t s;
   2485 	if(!stream_wait_lock_inited)
   2486 		return stream_wait_count;
   2487 	lock_basic_lock(&stream_wait_count_lock);
   2488 	s = stream_wait_count;
   2489 	lock_basic_unlock(&stream_wait_count_lock);
   2490 	return s;
   2491 }
   2492 
   2493 size_t http2_get_query_buffer_size(void)
   2494 {
   2495 	size_t s;
   2496 	if(!http2_query_buffer_lock_inited)
   2497 		return http2_query_buffer_count;
   2498 	lock_basic_lock(&http2_query_buffer_count_lock);
   2499 	s = http2_query_buffer_count;
   2500 	lock_basic_unlock(&http2_query_buffer_count_lock);
   2501 	return s;
   2502 }
   2503 
   2504 size_t http2_get_response_buffer_size(void)
   2505 {
   2506 	size_t s;
   2507 	if(!http2_response_buffer_lock_inited)
   2508 		return http2_response_buffer_count;
   2509 	lock_basic_lock(&http2_response_buffer_count_lock);
   2510 	s = http2_response_buffer_count;
   2511 	lock_basic_unlock(&http2_response_buffer_count_lock);
   2512 	return s;
   2513 }
   2514 
   2515 #ifdef HAVE_NGHTTP2
   2516 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
   2517 static ssize_t http2_submit_response_read_callback(
   2518 	nghttp2_session* ATTR_UNUSED(session),
   2519 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
   2520 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
   2521 {
   2522 	struct http2_stream* h2_stream;
   2523 	struct http2_session* h2_session = source->ptr;
   2524 	size_t copylen = length;
   2525 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
   2526 		h2_session->session, stream_id))) {
   2527 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
   2528 			"stream");
   2529 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
   2530 	}
   2531 	if(!h2_stream->rbuffer ||
   2532 		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
   2533 		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
   2534 			"available in rbuffer");
   2535 		/* rbuffer will be free'd in frame close cb */
   2536 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
   2537 	}
   2538 
   2539 	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
   2540 		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
   2541 	if(copylen > SSIZE_MAX)
   2542 		copylen = SSIZE_MAX; /* will probably never happen */
   2543 
   2544 	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
   2545 	sldns_buffer_skip(h2_stream->rbuffer, copylen);
   2546 
   2547 	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
   2548 		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
   2549 		lock_basic_lock(&http2_response_buffer_count_lock);
   2550 		http2_response_buffer_count -=
   2551 			sldns_buffer_capacity(h2_stream->rbuffer);
   2552 		lock_basic_unlock(&http2_response_buffer_count_lock);
   2553 		sldns_buffer_free(h2_stream->rbuffer);
   2554 		h2_stream->rbuffer = NULL;
   2555 	}
   2556 
   2557 	return copylen;
   2558 }
   2559 
   2560 /**
   2561  * Send RST_STREAM frame for stream.
   2562  * @param h2_session: http2 session to submit frame to
   2563  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
   2564  * @return 0 on error, 1 otherwise
   2565  */
   2566 static int http2_submit_rst_stream(struct http2_session* h2_session,
   2567 		struct http2_stream* h2_stream)
   2568 {
   2569 	int ret = nghttp2_submit_rst_stream(h2_session->session,
   2570 		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
   2571 		NGHTTP2_INTERNAL_ERROR);
   2572 	if(ret) {
   2573 		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
   2574 			"error: %s", nghttp2_strerror(ret));
   2575 		return 0;
   2576 	}
   2577 	return 1;
   2578 }
   2579 
   2580 /**
   2581  * DNS response ready to be submitted to nghttp2, to be prepared for sending
   2582  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
   2583  * might be used before this will be sent out.
   2584  * @param h2_session: http2 session, containing c->buffer which contains answer
   2585  * @return 0 on error, 1 otherwise
   2586  */
   2587 int http2_submit_dns_response(struct http2_session* h2_session)
   2588 {
   2589 	int ret;
   2590 	nghttp2_data_provider data_prd;
   2591 	char status[4];
   2592 	nghttp2_nv headers[3];
   2593 	struct http2_stream* h2_stream = h2_session->c->h2_stream;
   2594 	size_t rlen;
   2595 	char rlen_str[32];
   2596 
   2597 	if(h2_stream->rbuffer) {
   2598 		log_err("http2 submit response error: rbuffer already "
   2599 			"exists");
   2600 		return 0;
   2601 	}
   2602 	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
   2603 		log_err("http2 submit response error: c->buffer not complete");
   2604 		return 0;
   2605 	}
   2606 
   2607 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
   2608 		verbose(VERB_QUERY, "http2: submit response error: "
   2609 			"invalid status");
   2610 		return 0;
   2611 	}
   2612 
   2613 	rlen = sldns_buffer_remaining(h2_session->c->buffer);
   2614 	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
   2615 
   2616 	lock_basic_lock(&http2_response_buffer_count_lock);
   2617 	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
   2618 		lock_basic_unlock(&http2_response_buffer_count_lock);
   2619 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
   2620 			"in https-response-buffer-size");
   2621 		return http2_submit_rst_stream(h2_session, h2_stream);
   2622 	}
   2623 	http2_response_buffer_count += rlen;
   2624 	lock_basic_unlock(&http2_response_buffer_count_lock);
   2625 
   2626 	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
   2627 		lock_basic_lock(&http2_response_buffer_count_lock);
   2628 		http2_response_buffer_count -= rlen;
   2629 		lock_basic_unlock(&http2_response_buffer_count_lock);
   2630 		log_err("http2 submit response error: malloc failure");
   2631 		return 0;
   2632 	}
   2633 
   2634 	headers[0].name = (uint8_t*)":status";
   2635 	headers[0].namelen = 7;
   2636 	headers[0].value = (uint8_t*)status;
   2637 	headers[0].valuelen = 3;
   2638 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
   2639 
   2640 	headers[1].name = (uint8_t*)"content-type";
   2641 	headers[1].namelen = 12;
   2642 	headers[1].value = (uint8_t*)"application/dns-message";
   2643 	headers[1].valuelen = 23;
   2644 	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
   2645 
   2646 	headers[2].name = (uint8_t*)"content-length";
   2647 	headers[2].namelen = 14;
   2648 	headers[2].value = (uint8_t*)rlen_str;
   2649 	headers[2].valuelen = strlen(rlen_str);
   2650 	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
   2651 
   2652 	sldns_buffer_write(h2_stream->rbuffer,
   2653 		sldns_buffer_current(h2_session->c->buffer),
   2654 		sldns_buffer_remaining(h2_session->c->buffer));
   2655 	sldns_buffer_flip(h2_stream->rbuffer);
   2656 
   2657 	data_prd.source.ptr = h2_session;
   2658 	data_prd.read_callback = http2_submit_response_read_callback;
   2659 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
   2660 		headers, 3, &data_prd);
   2661 	if(ret) {
   2662 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
   2663 			"error: %s", nghttp2_strerror(ret));
   2664 		return 0;
   2665 	}
   2666 	return 1;
   2667 }
   2668 #else
/** Fallback for the other branch of the enclosing conditional (presumably
 * builds without nghttp2 - confirm against the matching #if). There is
 * nothing to submit a response to, so this always reports failure. */
int http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	/* 0 is the error result of the real implementation */
	return 0;
}
   2673 #endif
   2674 
   2675 #ifdef HAVE_NGHTTP2
   2676 /** HTTP status to descriptive string */
   2677 static char* http_status_to_str(enum http_status s)
   2678 {
   2679 	switch(s) {
   2680 		case HTTP_STATUS_OK:
   2681 			return "OK";
   2682 		case HTTP_STATUS_BAD_REQUEST:
   2683 			return "Bad Request";
   2684 		case HTTP_STATUS_NOT_FOUND:
   2685 			return "Not Found";
   2686 		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
   2687 			return "Payload Too Large";
   2688 		case HTTP_STATUS_URI_TOO_LONG:
   2689 			return "URI Too Long";
   2690 		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
   2691 			return "Unsupported Media Type";
   2692 		case HTTP_STATUS_NOT_IMPLEMENTED:
   2693 			return "Not Implemented";
   2694 	}
   2695 	return "Status Unknown";
   2696 }
   2697 
   2698 /** nghttp2 callback. Used to copy error message to nghttp2 session */
   2699 static ssize_t http2_submit_error_read_callback(
   2700 	nghttp2_session* ATTR_UNUSED(session),
   2701 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
   2702 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
   2703 {
   2704 	struct http2_stream* h2_stream;
   2705 	struct http2_session* h2_session = source->ptr;
   2706 	char* msg;
   2707 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
   2708 		h2_session->session, stream_id))) {
   2709 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
   2710 			"stream");
   2711 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
   2712 	}
   2713 	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
   2714 	msg = http_status_to_str(h2_stream->status);
   2715 	if(length < strlen(msg))
   2716 		return 0; /* not worth trying over multiple frames */
   2717 	memcpy(buf, msg, strlen(msg));
   2718 	return strlen(msg);
   2719 
   2720 }
   2721 
   2722 /**
   2723  * HTTP error response ready to be submitted to nghttp2, to be prepared for
   2724  * sending out. Message body will contain descriptive string for HTTP status.
   2725  * @param h2_session: http2 session to submit to
   2726  * @param h2_stream: http2 stream containing HTTP status to use for error
   2727  * @return 0 on error, 1 otherwise
   2728  */
   2729 static int http2_submit_error(struct http2_session* h2_session,
   2730 	struct http2_stream* h2_stream)
   2731 {
   2732 	int ret;
   2733 	char status[4];
   2734 	nghttp2_data_provider data_prd;
   2735 	nghttp2_nv headers[1]; /* will be copied by nghttp */
   2736 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
   2737 		verbose(VERB_QUERY, "http2: submit error failed, "
   2738 			"invalid status");
   2739 		return 0;
   2740 	}
   2741 	headers[0].name = (uint8_t*)":status";
   2742 	headers[0].namelen = 7;
   2743 	headers[0].value = (uint8_t*)status;
   2744 	headers[0].valuelen = 3;
   2745 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
   2746 
   2747 	data_prd.source.ptr = h2_session;
   2748 	data_prd.read_callback = http2_submit_error_read_callback;
   2749 
   2750 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
   2751 		headers, 1, &data_prd);
   2752 	if(ret) {
   2753 		verbose(VERB_QUERY, "http2: submit error failed, "
   2754 			"error: %s", nghttp2_strerror(ret));
   2755 		return 0;
   2756 	}
   2757 	return 1;
   2758 }
   2759 
   2760 /**
   2761  * Start query handling. Query is stored in the stream, and will be free'd here.
   2762  * @param h2_session: http2 session, containing comm point
   2763  * @param h2_stream: stream containing buffered query
   2764  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
   2765  * reply available (yet).
   2766  */
   2767 static int http2_query_read_done(struct http2_session* h2_session,
   2768 	struct http2_stream* h2_stream)
   2769 {
   2770 	log_assert(h2_stream->qbuffer);
   2771 
   2772 	if(h2_session->c->h2_stream) {
   2773 		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
   2774 			"buffer already assigned to stream");
   2775 		return -1;
   2776 	}
   2777 
   2778     /* the c->buffer might be used by mesh_send_reply and no be cleard
   2779 	 * need to be cleared before use */
   2780 	sldns_buffer_clear(h2_session->c->buffer);
   2781 	if(sldns_buffer_remaining(h2_session->c->buffer) <
   2782 		sldns_buffer_remaining(h2_stream->qbuffer)) {
   2783 		/* qbuffer will be free'd in frame close cb */
   2784 		sldns_buffer_clear(h2_session->c->buffer);
   2785 		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
   2786 			"qbuffer in c->buffer");
   2787 		return -1;
   2788 	}
   2789 
   2790 	sldns_buffer_write(h2_session->c->buffer,
   2791 		sldns_buffer_current(h2_stream->qbuffer),
   2792 		sldns_buffer_remaining(h2_stream->qbuffer));
   2793 
   2794 	lock_basic_lock(&http2_query_buffer_count_lock);
   2795 	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
   2796 	lock_basic_unlock(&http2_query_buffer_count_lock);
   2797 	sldns_buffer_free(h2_stream->qbuffer);
   2798 	h2_stream->qbuffer = NULL;
   2799 
   2800 	sldns_buffer_flip(h2_session->c->buffer);
   2801 	h2_session->c->h2_stream = h2_stream;
   2802 	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
   2803 	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
   2804 		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
   2805 		return 1; /* answer in c->buffer */
   2806 	}
   2807 	sldns_buffer_clear(h2_session->c->buffer);
   2808 	h2_session->c->h2_stream = NULL;
   2809 	return 0; /* mesh state added, or dropped */
   2810 }
   2811 
   2812 /** nghttp2 callback. Used to check if the received frame indicates the end of a
   2813  * stream. Gather collected request data and start query handling. */
   2814 static int http2_req_frame_recv_cb(nghttp2_session* session,
   2815 	const nghttp2_frame* frame, void* cb_arg)
   2816 {
   2817 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
   2818 	struct http2_stream* h2_stream;
   2819 	int query_read_done;
   2820 
   2821 	if((frame->hd.type != NGHTTP2_DATA &&
   2822 		frame->hd.type != NGHTTP2_HEADERS) ||
   2823 		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
   2824 			return 0;
   2825 	}
   2826 
   2827 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
   2828 		session, frame->hd.stream_id)))
   2829 		return 0;
   2830 
   2831 	if(h2_stream->invalid_endpoint) {
   2832 		h2_stream->status = HTTP_STATUS_NOT_FOUND;
   2833 		goto submit_http_error;
   2834 	}
   2835 
   2836 	if(h2_stream->invalid_content_type) {
   2837 		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
   2838 		goto submit_http_error;
   2839 	}
   2840 
   2841 	if(h2_stream->http_method != HTTP_METHOD_GET &&
   2842 		h2_stream->http_method != HTTP_METHOD_POST) {
   2843 		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
   2844 		goto submit_http_error;
   2845 	}
   2846 
   2847 	if(h2_stream->query_too_large) {
   2848 		if(h2_stream->http_method == HTTP_METHOD_POST)
   2849 			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
   2850 		else
   2851 			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
   2852 		goto submit_http_error;
   2853 	}
   2854 
   2855 	if(!h2_stream->qbuffer) {
   2856 		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
   2857 		goto submit_http_error;
   2858 	}
   2859 
   2860 	if(h2_stream->status) {
   2861 submit_http_error:
   2862 		verbose(VERB_QUERY, "http2 request invalid, returning :status="
   2863 			"%d", h2_stream->status);
   2864 		if(!http2_submit_error(h2_session, h2_stream)) {
   2865 			return NGHTTP2_ERR_CALLBACK_FAILURE;
   2866 		}
   2867 		return 0;
   2868 	}
   2869 	h2_stream->status = HTTP_STATUS_OK;
   2870 
   2871 	sldns_buffer_flip(h2_stream->qbuffer);
   2872 	h2_session->postpone_drop = 1;
   2873 	query_read_done = http2_query_read_done(h2_session, h2_stream);
   2874 	h2_session->postpone_drop = 0;
   2875 	if(query_read_done < 0)
   2876 		return NGHTTP2_ERR_CALLBACK_FAILURE;
   2877 	else if(!query_read_done) {
   2878 		if(h2_session->is_drop) {
   2879 			/* connection needs to be closed. Return failure to make
   2880 			 * sure no other action are taken anymore on comm point.
   2881 			 * failure will result in reclaiming (and closing)
   2882 			 * of comm point. */
   2883 			verbose(VERB_QUERY, "http2 query dropped in worker cb");
   2884 			return NGHTTP2_ERR_CALLBACK_FAILURE;
   2885 		}
   2886 		/* nothing to submit right now, query added to mesh. */
   2887 		return 0;
   2888 	}
   2889 	if(!http2_submit_dns_response(h2_session)) {
   2890 		sldns_buffer_clear(h2_session->c->buffer);
   2891 		h2_session->c->h2_stream = NULL;
   2892 		return NGHTTP2_ERR_CALLBACK_FAILURE;
   2893 	}
   2894 	verbose(VERB_QUERY, "http2 query submitted to session");
   2895 	sldns_buffer_clear(h2_session->c->buffer);
   2896 	h2_session->c->h2_stream = NULL;
   2897 	return 0;
   2898 }
   2899 
   2900 /** nghttp2 callback. Used to detect start of new streams. */
   2901 static int http2_req_begin_headers_cb(nghttp2_session* session,
   2902 	const nghttp2_frame* frame, void* cb_arg)
   2903 {
   2904 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
   2905 	struct http2_stream* h2_stream;
   2906 	int ret;
   2907 	if(frame->hd.type != NGHTTP2_HEADERS ||
   2908 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
   2909 		/* only interested in request headers */
   2910 		return 0;
   2911 	}
   2912 	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
   2913 		log_err("malloc failure while creating http2 stream");
   2914 		return NGHTTP2_ERR_CALLBACK_FAILURE;
   2915 	}
   2916 	http2_session_add_stream(h2_session, h2_stream);
   2917 	ret = nghttp2_session_set_stream_user_data(session,
   2918 		frame->hd.stream_id, h2_stream);
   2919 	if(ret) {
   2920 		/* stream does not exist */
   2921 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
   2922 			"error: %s", nghttp2_strerror(ret));
   2923 		return NGHTTP2_ERR_CALLBACK_FAILURE;
   2924 	}
   2925 
   2926 	return 0;
   2927 }
   2928 
   2929 /**
   2930  * base64url decode, store in qbuffer
   2931  * @param h2_session: http2 session
   2932  * @param h2_stream: http2 stream
   2933  * @param start: start of the base64 string
   2934  * @param length: length of the base64 string
   2935  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
   2936  * buffer will be NULL is unparseble.
   2937  */
   2938 static int http2_buffer_uri_query(struct http2_session* h2_session,
   2939 	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
   2940 {
   2941 	size_t expectb64len;
   2942 	int b64len;
   2943 	if(h2_stream->http_method == HTTP_METHOD_POST)
   2944 		return 1;
   2945 	if(length == 0)
   2946 		return 1;
   2947 	if(h2_stream->qbuffer) {
   2948 		verbose(VERB_ALGO, "http2_req_header fail, "
   2949 			"qbuffer already set");
   2950 		return 0;
   2951 	}
   2952 
   2953 	/* calculate size, might be a bit bigger than the real
   2954 	 * decoded buffer size */
   2955 	expectb64len = sldns_b64_pton_calculate_size(length);
   2956 	log_assert(expectb64len > 0);
   2957 	if(expectb64len >
   2958 		h2_session->c->http2_stream_max_qbuffer_size) {
   2959 		h2_stream->query_too_large = 1;
   2960 		return 1;
   2961 	}
   2962 
   2963 	lock_basic_lock(&http2_query_buffer_count_lock);
   2964 	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
   2965 		lock_basic_unlock(&http2_query_buffer_count_lock);
   2966 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
   2967 			"in http2-query-buffer-size");
   2968 		return http2_submit_rst_stream(h2_session, h2_stream);
   2969 	}
   2970 	http2_query_buffer_count += expectb64len;
   2971 	lock_basic_unlock(&http2_query_buffer_count_lock);
   2972 	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
   2973 		lock_basic_lock(&http2_query_buffer_count_lock);
   2974 		http2_query_buffer_count -= expectb64len;
   2975 		lock_basic_unlock(&http2_query_buffer_count_lock);
   2976 		log_err("http2_req_header fail, qbuffer "
   2977 			"malloc failure");
   2978 		return 0;
   2979 	}
   2980 
   2981 	if(sldns_b64_contains_nonurl((char const*)start, length)) {
   2982 		char buf[65536+4];
   2983 		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
   2984 		/* copy to the scratch buffer temporarily to terminate the
   2985 		 * string with a zero */
   2986 		if(length+1 > sizeof(buf)) {
   2987 			/* too long */
   2988 			lock_basic_lock(&http2_query_buffer_count_lock);
   2989 			http2_query_buffer_count -= expectb64len;
   2990 			lock_basic_unlock(&http2_query_buffer_count_lock);
   2991 			sldns_buffer_free(h2_stream->qbuffer);
   2992 			h2_stream->qbuffer = NULL;
   2993 			return 1;
   2994 		}
   2995 		memmove(buf, start, length);
   2996 		buf[length] = 0;
   2997 		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
   2998 			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
   2999 			lock_basic_lock(&http2_query_buffer_count_lock);
   3000 			http2_query_buffer_count -= expectb64len;
   3001 			lock_basic_unlock(&http2_query_buffer_count_lock);
   3002 			sldns_buffer_free(h2_stream->qbuffer);
   3003 			h2_stream->qbuffer = NULL;
   3004 			return 1;
   3005 		}
   3006 	} else {
   3007 		if(!(b64len = sldns_b64url_pton(
   3008 			(char const *)start, length,
   3009 			sldns_buffer_current(h2_stream->qbuffer),
   3010 			expectb64len)) || b64len < 0) {
   3011 			lock_basic_lock(&http2_query_buffer_count_lock);
   3012 			http2_query_buffer_count -= expectb64len;
   3013 			lock_basic_unlock(&http2_query_buffer_count_lock);
   3014 			sldns_buffer_free(h2_stream->qbuffer);
   3015 			h2_stream->qbuffer = NULL;
   3016 			/* return without error, method can be an
   3017 			 * unknown POST */
   3018 			return 1;
   3019 		}
   3020 	}
   3021 	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
   3022 	return 1;
   3023 }
   3024 
   3025 /** nghttp2 callback. Used to parse headers from HEADER frames. */
   3026 static int http2_req_header_cb(nghttp2_session* session,
   3027 	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
   3028 	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
   3029 	void* cb_arg)
   3030 {
   3031 	struct http2_stream* h2_stream = NULL;
   3032 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
   3033 	/* nghttp2 deals with CONTINUATION frames and provides them as part of
   3034 	 * the HEADER */
   3035 	if(frame->hd.type != NGHTTP2_HEADERS ||
   3036 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
   3037 		/* only interested in request headers */
   3038 		return 0;
   3039 	}
   3040 	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
   3041 		frame->hd.stream_id)))
   3042 		return 0;
   3043 
   3044 	/* earlier checks already indicate we can stop handling this query */
   3045 	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
   3046 		h2_stream->invalid_content_type ||
   3047 		h2_stream->invalid_endpoint)
   3048 		return 0;
   3049 
   3050 
   3051 	/* nghttp2 performs some sanity checks in the headers, including:
   3052 	 * name and value are guaranteed to be null terminated
   3053 	 * name is guaranteed to be lowercase
   3054 	 * content-length value is guaranteed to contain digits
   3055 	 */
   3056 
   3057 	if(!h2_stream->http_method && namelen == 7 &&
   3058 		memcmp(":method", name, namelen) == 0) {
   3059 		/* Case insensitive check on :method value to be on the safe
   3060 		 * side. I failed to find text about case sensitivity in specs.
   3061 		 */
   3062 		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
   3063 			h2_stream->http_method = HTTP_METHOD_GET;
   3064 		else if(valuelen == 4 &&
   3065 			strcasecmp("POST", (const char*)value) == 0) {
   3066 			h2_stream->http_method = HTTP_METHOD_POST;
   3067 			if(h2_stream->qbuffer) {
   3068 				/* POST method uses query from DATA frames */
   3069 				lock_basic_lock(&http2_query_buffer_count_lock);
   3070 				http2_query_buffer_count -=
   3071 					sldns_buffer_capacity(h2_stream->qbuffer);
   3072 				lock_basic_unlock(&http2_query_buffer_count_lock);
   3073 				sldns_buffer_free(h2_stream->qbuffer);
   3074 				h2_stream->qbuffer = NULL;
   3075 			}
   3076 		} else
   3077 			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
   3078 		return 0;
   3079 	}
   3080 	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
   3081 		/* :path may contain DNS query, depending on method. Method might
   3082 		 * not be known yet here, so check after finishing receiving
   3083 		 * stream. */
   3084 #define	HTTP_QUERY_PARAM "?dns="
   3085 		size_t el = strlen(h2_session->c->http_endpoint);
   3086 		size_t qpl = strlen(HTTP_QUERY_PARAM);
   3087 
   3088 		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
   3089 			value, el) != 0) {
   3090 			h2_stream->invalid_endpoint = 1;
   3091 			return 0;
   3092 		}
   3093 		/* larger than endpoint only allowed if it is for the query
   3094 		 * parameter */
   3095 		if(valuelen <= el+qpl ||
   3096 			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
   3097 			if(valuelen != el)
   3098 				h2_stream->invalid_endpoint = 1;
   3099 			return 0;
   3100 		}
   3101 
   3102 		if(!http2_buffer_uri_query(h2_session, h2_stream,
   3103 			value+(el+qpl), valuelen-(el+qpl))) {
   3104 			return NGHTTP2_ERR_CALLBACK_FAILURE;
   3105 		}
   3106 		return 0;
   3107 	}
   3108 	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
   3109 	 * and not needed when using GET. Don't enforce.
   3110 	 * If set only allow lowercase "application/dns-message".
   3111 	 *
   3112 	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
   3113 	 * be able to handle "application/dns-message". Since that is the only
   3114 	 * content-type supported we can ignore the accept header.
   3115 	 */
   3116 	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
   3117 		if(valuelen != 23 || memcmp("application/dns-message", value,
   3118 			valuelen) != 0) {
   3119 			h2_stream->invalid_content_type = 1;
   3120 		}
   3121 	}
   3122 
   3123 	/* Only interested in content-lentg for POST (on not yet known) method.
   3124 	 */
   3125 	if((!h2_stream->http_method ||
   3126 		h2_stream->http_method == HTTP_METHOD_POST) &&
   3127 		!h2_stream->content_length && namelen  == 14 &&
   3128 		memcmp("content-length", name, namelen) == 0) {
   3129 		if(valuelen > 5) {
   3130 			h2_stream->query_too_large = 1;
   3131 			return 0;
   3132 		}
   3133 		/* guaranteed to only contain digits and be null terminated */
   3134 		h2_stream->content_length = atoi((const char*)value);
   3135 		if(h2_stream->content_length >
   3136 			h2_session->c->http2_stream_max_qbuffer_size) {
   3137 			h2_stream->query_too_large = 1;
   3138 			return 0;
   3139 		}
   3140 	}
   3141 	return 0;
   3142 }
   3143 
   3144 /** nghttp2 callback. Used to get data from DATA frames, which can contain
   3145  * queries in POST requests. */
   3146 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
   3147 	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
   3148 	size_t len, void* cb_arg)
   3149 {
   3150 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
   3151 	struct http2_stream* h2_stream;
   3152 	size_t qlen = 0;
   3153 
   3154 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
   3155 		h2_session->session, stream_id))) {
   3156 		return 0;
   3157 	}
   3158 
   3159 	if(h2_stream->query_too_large)
   3160 		return 0;
   3161 
   3162 	if(!h2_stream->qbuffer) {
   3163 		if(h2_stream->content_length) {
   3164 			if(h2_stream->content_length < len)
   3165 				/* getting more data in DATA frame than
   3166 				 * advertised in content-length header. */
   3167 				return NGHTTP2_ERR_CALLBACK_FAILURE;
   3168 			qlen = h2_stream->content_length;
   3169 		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
   3170 			/* setting this to msg-buffer-size can result in a lot
   3171 			 * of memory consumption. Most queries should fit in a
   3172 			 * single DATA frame, and most POST queries will
   3173 			 * contain content-length which does not impose this
   3174 			 * limit. */
   3175 			qlen = len;
   3176 		}
   3177 	}
   3178 	if(!h2_stream->qbuffer && qlen) {
   3179 		lock_basic_lock(&http2_query_buffer_count_lock);
   3180 		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
   3181 			lock_basic_unlock(&http2_query_buffer_count_lock);
   3182 			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
   3183 				"in http2-query-buffer-size");
   3184 			return http2_submit_rst_stream(h2_session, h2_stream);
   3185 		}
   3186 		http2_query_buffer_count += qlen;
   3187 		lock_basic_unlock(&http2_query_buffer_count_lock);
   3188 		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
   3189 			lock_basic_lock(&http2_query_buffer_count_lock);
   3190 			http2_query_buffer_count -= qlen;
   3191 			lock_basic_unlock(&http2_query_buffer_count_lock);
   3192 		}
   3193 	}
   3194 
   3195 	if(!h2_stream->qbuffer ||
   3196 		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
   3197 		verbose(VERB_ALGO, "http2 data_chunk_recv failed. Not enough "
   3198 			"buffer space for POST query. Can happen on multi "
   3199 			"frame requests without content-length header");
   3200 		h2_stream->query_too_large = 1;
   3201 		return 0;
   3202 	}
   3203 
   3204 	sldns_buffer_write(h2_stream->qbuffer, data, len);
   3205 
   3206 	return 0;
   3207 }
   3208 
   3209 void http2_req_stream_clear(struct http2_stream* h2_stream)
   3210 {
   3211 	if(h2_stream->qbuffer) {
   3212 		lock_basic_lock(&http2_query_buffer_count_lock);
   3213 		http2_query_buffer_count -=
   3214 			sldns_buffer_capacity(h2_stream->qbuffer);
   3215 		lock_basic_unlock(&http2_query_buffer_count_lock);
   3216 		sldns_buffer_free(h2_stream->qbuffer);
   3217 		h2_stream->qbuffer = NULL;
   3218 	}
   3219 	if(h2_stream->rbuffer) {
   3220 		lock_basic_lock(&http2_response_buffer_count_lock);
   3221 		http2_response_buffer_count -=
   3222 			sldns_buffer_capacity(h2_stream->rbuffer);
   3223 		lock_basic_unlock(&http2_response_buffer_count_lock);
   3224 		sldns_buffer_free(h2_stream->rbuffer);
   3225 		h2_stream->rbuffer = NULL;
   3226 	}
   3227 }
   3228 
   3229 nghttp2_session_callbacks* http2_req_callbacks_create(void)
   3230 {
   3231 	nghttp2_session_callbacks *callbacks;
   3232 	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
   3233 		log_err("failed to initialize nghttp2 callback");
   3234 		return NULL;
   3235 	}
   3236 	/* reception of header block started, used to create h2_stream */
   3237 	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
   3238 		http2_req_begin_headers_cb);
   3239 	/* complete frame received, used to get data from stream if frame
   3240 	 * has end stream flag, and start processing query */
   3241 	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
   3242 		http2_req_frame_recv_cb);
   3243 	/* get request info from headers */
   3244 	nghttp2_session_callbacks_set_on_header_callback(callbacks,
   3245 		http2_req_header_cb);
   3246 	/* get data from DATA frames, containing POST query */
   3247 	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
   3248 		http2_req_data_chunk_recv_cb);
   3249 
   3250 	/* generic HTTP2 callbacks */
   3251 	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
   3252 	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
   3253 	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
   3254 		http2_stream_close_cb);
   3255 
   3256 	return callbacks;
   3257 }
   3258 #endif /* HAVE_NGHTTP2 */
   3259 
   3260 #ifdef HAVE_NGTCP2
   3261 struct doq_table*
   3262 doq_table_create(struct config_file* cfg, struct ub_randstate* rnd)
   3263 {
   3264 	struct doq_table* table;
   3265 
   3266 	if (!cfg->quic_port)
   3267 		return NULL;
   3268 	table = calloc(1, sizeof(*table));
   3269 	if(!table)
   3270 		return NULL;
   3271 #ifdef USE_NGTCP2_CRYPTO_OSSL
   3272 	/* Initialize the ossl crypto, it is harmless to call twice,
   3273 	 * and this is before use of doq connections. */
   3274 	if(ngtcp2_crypto_ossl_init() != 0) {
   3275 		log_err("ngtcp2_crypto_ossl_init failed");
   3276 		free(table);
   3277 		return NULL;
   3278 	}
   3279 #elif defined(HAVE_NGTCP2_CRYPTO_QUICTLS_INIT)
   3280 	if(ngtcp2_crypto_quictls_init() != 0) {
   3281 		log_err("ngtcp2_crypto_quictls_init failed");
   3282 		free(table);
   3283 		return NULL;
   3284 	}
   3285 #endif
   3286 	table->idle_timeout = ((uint64_t)cfg->tcp_idle_timeout)*
   3287 		NGTCP2_MILLISECONDS;
   3288 	table->sv_scidlen = 16;
   3289 	table->static_secret_len = 16;
   3290 	table->static_secret = malloc(table->static_secret_len);
   3291 	if(!table->static_secret) {
   3292 		free(table);
   3293 		return NULL;
   3294 	}
   3295 	doq_fill_rand(rnd, table->static_secret, table->static_secret_len);
   3296 	table->conn_tree = rbtree_create(doq_conn_cmp);
   3297 	if(!table->conn_tree) {
   3298 		free(table->static_secret);
   3299 		free(table);
   3300 		return NULL;
   3301 	}
   3302 	table->conid_tree = rbtree_create(doq_conid_cmp);
   3303 	if(!table->conid_tree) {
   3304 		free(table->static_secret);
   3305 		free(table->conn_tree);
   3306 		free(table);
   3307 		return NULL;
   3308 	}
   3309 	table->timer_tree = rbtree_create(doq_timer_cmp);
   3310 	if(!table->timer_tree) {
   3311 		free(table->static_secret);
   3312 		free(table->conn_tree);
   3313 		free(table->conid_tree);
   3314 		free(table);
   3315 		return NULL;
   3316 	}
   3317 	lock_rw_init(&table->lock);
   3318 	lock_rw_init(&table->conid_lock);
   3319 	lock_basic_init(&table->size_lock);
   3320 	lock_protect(&table->lock, &table->static_secret,
   3321 		sizeof(table->static_secret));
   3322 	lock_protect(&table->lock, &table->static_secret_len,
   3323 		sizeof(table->static_secret_len));
   3324 	lock_protect(&table->lock, table->static_secret,
   3325 		table->static_secret_len);
   3326 	lock_protect(&table->lock, &table->sv_scidlen,
   3327 		sizeof(table->sv_scidlen));
   3328 	lock_protect(&table->lock, &table->idle_timeout,
   3329 		sizeof(table->idle_timeout));
   3330 	lock_protect(&table->lock, &table->conn_tree, sizeof(table->conn_tree));
   3331 	lock_protect(&table->lock, table->conn_tree, sizeof(*table->conn_tree));
   3332 	lock_protect(&table->conid_lock, table->conid_tree,
   3333 		sizeof(*table->conid_tree));
   3334 	lock_protect(&table->lock, table->timer_tree,
   3335 		sizeof(*table->timer_tree));
   3336 	lock_protect(&table->size_lock, &table->current_size,
   3337 		sizeof(table->current_size));
   3338 	return table;
   3339 }
   3340 
   3341 /** delete elements from the connection tree */
   3342 static void
   3343 conn_tree_del(rbnode_type* node, void* arg)
   3344 {
   3345 	struct doq_table* table = (struct doq_table*)arg;
   3346 	struct doq_conn* conn;
   3347 	if(!node || !table)
   3348 		return;
   3349 	conn = (struct doq_conn*)node->key;
   3350 	if(conn->timer.timer_in_list) {
   3351 		/* Remove timer from list first, because finding the rbnode
   3352 		 * element of the setlist of same timeouts needs tree lookup.
   3353 		 * Edit the tree structure after that lookup. */
   3354 		doq_timer_list_remove(conn->table, &conn->timer);
   3355 	}
   3356 	if(conn->timer.timer_in_tree)
   3357 		doq_timer_tree_remove(conn->table, &conn->timer);
   3358 	doq_table_quic_size_subtract(table, sizeof(*conn)+conn->key.dcidlen);
   3359 	doq_conn_delete(conn, table);
   3360 }
   3361 
   3362 /** delete elements from the connection id tree */
   3363 static void
   3364 conid_tree_del(rbnode_type* node, void* ATTR_UNUSED(arg))
   3365 {
   3366 	if(!node)
   3367 		return;
   3368 	doq_conid_delete((struct doq_conid*)node->key);
   3369 }
   3370 
   3371 void
   3372 doq_table_delete(struct doq_table* table)
   3373 {
   3374 	if(!table)
   3375 		return;
   3376 	lock_rw_destroy(&table->lock);
   3377 	free(table->static_secret);
   3378 	if(table->conn_tree) {
   3379 		traverse_postorder(table->conn_tree, conn_tree_del, table);
   3380 		free(table->conn_tree);
   3381 	}
   3382 	lock_rw_destroy(&table->conid_lock);
   3383 	if(table->conid_tree) {
   3384 		/* The tree should be empty, because the doq_conn_delete calls
   3385 		 * above should have also removed their conid elements. */
   3386 		traverse_postorder(table->conid_tree, conid_tree_del, NULL);
   3387 		free(table->conid_tree);
   3388 	}
   3389 	lock_basic_destroy(&table->size_lock);
   3390 	if(table->timer_tree) {
   3391 		/* The tree should be empty, because the conn_tree_del calls
   3392 		 * above should also have removed them. Also the doq_timer
   3393 		 * is part of the doq_conn struct, so is already freed. */
   3394 		free(table->timer_tree);
   3395 	}
   3396 	table->write_list_first = NULL;
   3397 	table->write_list_last = NULL;
   3398 	free(table);
   3399 }
   3400 
   3401 struct doq_timer*
   3402 doq_timer_find_time(struct doq_table* table, struct timeval* tv)
   3403 {
   3404 	struct doq_timer key;
   3405 	struct rbnode_type* node;
   3406 	log_assert(table != NULL);
   3407 	memset(&key, 0, sizeof(key));
   3408 	key.time.tv_sec = tv->tv_sec;
   3409 	key.time.tv_usec = tv->tv_usec;
   3410 	node = rbtree_search(table->timer_tree, &key);
   3411 	if(node)
   3412 		return (struct doq_timer*)node->key;
   3413 	return NULL;
   3414 }
   3415 
/**
 * Remove a timer that is an element of the timer tree.
 * Returns immediately when the timer is not flagged as being in the tree.
 * When the removed timer carries a setlist of further timers, the first
 * timer of that list is inserted into the tree and takes over the rest of
 * the list and the worker_doq_socket pointer.
 * @param table: table with the timer tree.
 * @param timer: the timer to remove from the tree.
 */
void
doq_timer_tree_remove(struct doq_table* table, struct doq_timer* timer)
{
	if(!timer->timer_in_tree)
		return;
	rbtree_delete(table->timer_tree, timer);
	timer->timer_in_tree = 0;
	/* This item could have more timers in the same set. */
	if(timer->setlist_first) {
		struct doq_timer* rb_timer = timer->setlist_first;
		/* del first element from setlist */
		if(rb_timer->setlist_next)
			rb_timer->setlist_next->setlist_prev = NULL;
		else
			timer->setlist_last = NULL;
		timer->setlist_first = rb_timer->setlist_next;
		/* rb_timer is now detached from the list */
		rb_timer->setlist_prev = NULL;
		rb_timer->setlist_next = NULL;
		rb_timer->timer_in_list = 0;
		/* insert it into the tree as new rb element */
		memset(&rb_timer->node, 0, sizeof(rb_timer->node));
		rb_timer->node.key = rb_timer;
		rbtree_insert(table->timer_tree, &rb_timer->node);
		rb_timer->timer_in_tree = 1;
		/* the setlist, if any remainder, moves to the rb element */
		rb_timer->setlist_first = timer->setlist_first;
		rb_timer->setlist_last = timer->setlist_last;
		timer->setlist_first = NULL;
		timer->setlist_last = NULL;
		/* the new tree element inherits the socket pointer before
		 * it is cleared on the removed timer below */
		rb_timer->worker_doq_socket = timer->worker_doq_socket;
	}
	timer->worker_doq_socket = NULL;
}
   3449 
   3450 void
   3451 doq_timer_list_remove(struct doq_table* table, struct doq_timer* timer)
   3452 {
   3453 	struct doq_timer* rb_timer;
   3454 	if(!timer->timer_in_list)
   3455 		return;
   3456 	/* The item in the rbtree has the list start and end. */
   3457 	rb_timer = doq_timer_find_time(table, &timer->time);
   3458 	if(rb_timer) {
   3459 		if(timer->setlist_prev)
   3460 			timer->setlist_prev->setlist_next = timer->setlist_next;
   3461 		else
   3462 			rb_timer->setlist_first = timer->setlist_next;
   3463 		if(timer->setlist_next)
   3464 			timer->setlist_next->setlist_prev = timer->setlist_prev;
   3465 		else
   3466 			rb_timer->setlist_last = timer->setlist_prev;
   3467 		timer->setlist_prev = NULL;
   3468 		timer->setlist_next = NULL;
   3469 	}
   3470 	timer->timer_in_list = 0;
   3471 }
   3472 
   3473 /** doq append timer to setlist */
   3474 static void
   3475 doq_timer_list_append(struct doq_timer* rb_timer, struct doq_timer* timer)
   3476 {
   3477 	log_assert(timer->timer_in_list == 0);
   3478 	timer->timer_in_list = 1;
   3479 	timer->setlist_next = NULL;
   3480 	timer->setlist_prev = rb_timer->setlist_last;
   3481 	if(rb_timer->setlist_last)
   3482 		rb_timer->setlist_last->setlist_next = timer;
   3483 	else
   3484 		rb_timer->setlist_first = timer;
   3485 	rb_timer->setlist_last = timer;
   3486 }
   3487 
   3488 void
   3489 doq_timer_unset(struct doq_table* table, struct doq_timer* timer)
   3490 {
   3491 	if(timer->timer_in_list) {
   3492 		/* Remove timer from list first, because finding the rbnode
   3493 		 * element of the setlist of same timeouts needs tree lookup.
   3494 		 * Edit the tree structure after that lookup. */
   3495 		doq_timer_list_remove(table, timer);
   3496 	}
   3497 	if(timer->timer_in_tree)
   3498 		doq_timer_tree_remove(table, timer);
   3499 	timer->worker_doq_socket = NULL;
   3500 }
   3501 
   3502 void doq_timer_set(struct doq_table* table, struct doq_timer* timer,
   3503 	struct doq_server_socket* worker_doq_socket, struct timeval* tv)
   3504 {
   3505 	struct doq_timer* rb_timer;
   3506 	if(verbosity >= VERB_ALGO && timer->conn) {
   3507 		char a[256];
   3508 		struct timeval rel;
   3509 		addr_to_str((void*)&timer->conn->key.paddr.addr,
   3510 			timer->conn->key.paddr.addrlen, a, sizeof(a));
   3511 		timeval_subtract(&rel, tv, worker_doq_socket->now_tv);
   3512 		verbose(VERB_ALGO, "doq %s timer set %d.%6.6d in %d.%6.6d",
   3513 			a, (int)tv->tv_sec, (int)tv->tv_usec,
   3514 			(int)rel.tv_sec, (int)rel.tv_usec);
   3515 	}
   3516 	if(timer->timer_in_tree || timer->timer_in_list) {
   3517 		if(timer->time.tv_sec == tv->tv_sec &&
   3518 			timer->time.tv_usec == tv->tv_usec)
   3519 			return; /* already set on that time */
   3520 		doq_timer_unset(table, timer);
   3521 	}
   3522 	timer->time.tv_sec = tv->tv_sec;
   3523 	timer->time.tv_usec = tv->tv_usec;
   3524 	rb_timer = doq_timer_find_time(table, tv);
   3525 	if(rb_timer) {
   3526 		/* There is a timeout already with this value. Timer is
   3527 		 * added to the setlist. */
   3528 		doq_timer_list_append(rb_timer, timer);
   3529 	} else {
   3530 		/* There is no timeout with this value. Make timer a new
   3531 		 * tree element. */
   3532 		memset(&timer->node, 0, sizeof(timer->node));
   3533 		timer->node.key = timer;
   3534 		rbtree_insert(table->timer_tree, &timer->node);
   3535 		timer->timer_in_tree = 1;
   3536 		timer->setlist_first = NULL;
   3537 		timer->setlist_last = NULL;
   3538 		timer->worker_doq_socket = worker_doq_socket;
   3539 	}
   3540 }
   3541 
   3542 struct doq_conn*
   3543 doq_conn_create(struct comm_point* c, struct doq_pkt_addr* paddr,
   3544 	const uint8_t* dcid, size_t dcidlen, uint32_t version)
   3545 {
   3546 	struct doq_conn* conn = calloc(1, sizeof(*conn));
   3547 	if(!conn)
   3548 		return NULL;
   3549 	conn->node.key = conn;
   3550 	conn->doq_socket = c->doq_socket;
   3551 	conn->table = c->doq_socket->table;
   3552 	memmove(&conn->key.paddr.addr, &paddr->addr, paddr->addrlen);
   3553 	conn->key.paddr.addrlen = paddr->addrlen;
   3554 	memmove(&conn->key.paddr.localaddr, &paddr->localaddr,
   3555 		paddr->localaddrlen);
   3556 	conn->key.paddr.localaddrlen = paddr->localaddrlen;
   3557 	conn->key.paddr.ifindex = paddr->ifindex;
   3558 	conn->key.dcid = memdup((void*)dcid, dcidlen);
   3559 	if(!conn->key.dcid) {
   3560 		free(conn);
   3561 		return NULL;
   3562 	}
   3563 	conn->key.dcidlen = dcidlen;
   3564 	conn->version = version;
   3565 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   3566 	ngtcp2_ccerr_default(&conn->ccerr);
   3567 #else
   3568 	ngtcp2_connection_close_error_default(&conn->last_error);
   3569 #endif
   3570 	rbtree_init(&conn->stream_tree, &doq_stream_cmp);
   3571 	conn->timer.conn = conn;
   3572 	lock_basic_init(&conn->lock);
   3573 	lock_protect(&conn->lock, &conn->key, sizeof(conn->key));
   3574 	lock_protect(&conn->lock, &conn->doq_socket, sizeof(conn->doq_socket));
   3575 	lock_protect(&conn->lock, &conn->table, sizeof(conn->table));
   3576 	lock_protect(&conn->lock, &conn->is_deleted, sizeof(conn->is_deleted));
   3577 	lock_protect(&conn->lock, &conn->version, sizeof(conn->version));
   3578 	lock_protect(&conn->lock, &conn->conn, sizeof(conn->conn));
   3579 	lock_protect(&conn->lock, &conn->conid_list, sizeof(conn->conid_list));
   3580 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   3581 	lock_protect(&conn->lock, &conn->ccerr, sizeof(conn->ccerr));
   3582 #else
   3583 	lock_protect(&conn->lock, &conn->last_error, sizeof(conn->last_error));
   3584 #endif
   3585 	lock_protect(&conn->lock, &conn->tls_alert, sizeof(conn->tls_alert));
   3586 	lock_protect(&conn->lock, &conn->ssl, sizeof(conn->ssl));
   3587 	lock_protect(&conn->lock, &conn->close_pkt, sizeof(conn->close_pkt));
   3588 	lock_protect(&conn->lock, &conn->close_pkt_len, sizeof(conn->close_pkt_len));
   3589 	lock_protect(&conn->lock, &conn->close_ecn, sizeof(conn->close_ecn));
   3590 	lock_protect(&conn->lock, &conn->stream_tree, sizeof(conn->stream_tree));
   3591 	lock_protect(&conn->lock, &conn->stream_write_first, sizeof(conn->stream_write_first));
   3592 	lock_protect(&conn->lock, &conn->stream_write_last, sizeof(conn->stream_write_last));
   3593 	lock_protect(&conn->lock, &conn->write_interest, sizeof(conn->write_interest));
   3594 	lock_protect(&conn->lock, &conn->on_write_list, sizeof(conn->on_write_list));
   3595 	lock_protect(&conn->lock, &conn->write_prev, sizeof(conn->write_prev));
   3596 	lock_protect(&conn->lock, &conn->write_next, sizeof(conn->write_next));
   3597 	return conn;
   3598 }
   3599 
   3600 /** delete stream tree node */
   3601 static void
   3602 stream_tree_del(rbnode_type* node, void* arg)
   3603 {
   3604 	struct doq_table* table = (struct doq_table*)arg;
   3605 	struct doq_stream* stream;
   3606 	if(!node)
   3607 		return;
   3608 	stream = (struct doq_stream*)node;
   3609 	if(stream->in)
   3610 		doq_table_quic_size_subtract(table, stream->inlen);
   3611 	if(stream->out)
   3612 		doq_table_quic_size_subtract(table, stream->outlen);
   3613 	doq_table_quic_size_subtract(table, sizeof(*stream));
   3614 	doq_stream_delete(stream);
   3615 }
   3616 
   3617 void
   3618 doq_conn_delete(struct doq_conn* conn, struct doq_table* table)
   3619 {
   3620 	if(!conn)
   3621 		return;
   3622 	lock_basic_destroy(&conn->lock);
   3623 	lock_rw_wrlock(&conn->table->conid_lock);
   3624 	doq_conn_clear_conids(conn);
   3625 	lock_rw_unlock(&conn->table->conid_lock);
   3626 	/* Remove the app data from ngtcp2 before SSL_free of conn->ssl,
   3627 	 * because the ngtcp2 conn is deleted. */
   3628 	SSL_set_app_data(conn->ssl, NULL);
   3629 	if(conn->stream_tree.count != 0) {
   3630 		traverse_postorder(&conn->stream_tree, stream_tree_del, table);
   3631 	}
   3632 	free(conn->key.dcid);
   3633 	SSL_free(conn->ssl);
   3634 #ifdef USE_NGTCP2_CRYPTO_OSSL
   3635 	ngtcp2_crypto_ossl_ctx_del(conn->ossl_ctx);
   3636 #endif
   3637 	ngtcp2_conn_del(conn->conn);
   3638 	free(conn->close_pkt);
   3639 	free(conn);
   3640 }
   3641 
   3642 int
   3643 doq_conn_cmp(const void* key1, const void* key2)
   3644 {
   3645 	struct doq_conn* c = (struct doq_conn*)key1;
   3646 	struct doq_conn* d = (struct doq_conn*)key2;
   3647 	int r;
   3648 	/* Compared in the order destination address, then
   3649 	 * local address, ifindex and then dcid.
   3650 	 * So that for a search for findlessorequal for the destination
   3651 	 * address will find connections to that address, with different
   3652 	 * dcids.
   3653 	 * Also a printout in sorted order prints the connections by IP
   3654 	 * address of destination, and then a number of them depending on the
   3655 	 * dcids. */
   3656 	if(c->key.paddr.addrlen != d->key.paddr.addrlen) {
   3657 		if(c->key.paddr.addrlen < d->key.paddr.addrlen)
   3658 			return -1;
   3659 		return 1;
   3660 	}
   3661 	if((r=memcmp(&c->key.paddr.addr, &d->key.paddr.addr,
   3662 		c->key.paddr.addrlen))!=0)
   3663 		return r;
   3664 	if(c->key.paddr.localaddrlen != d->key.paddr.localaddrlen) {
   3665 		if(c->key.paddr.localaddrlen < d->key.paddr.localaddrlen)
   3666 			return -1;
   3667 		return 1;
   3668 	}
   3669 	if((r=memcmp(&c->key.paddr.localaddr, &d->key.paddr.localaddr,
   3670 		c->key.paddr.localaddrlen))!=0)
   3671 		return r;
   3672 	if(c->key.paddr.ifindex != d->key.paddr.ifindex) {
   3673 		if(c->key.paddr.ifindex < d->key.paddr.ifindex)
   3674 			return -1;
   3675 		return 1;
   3676 	}
   3677 	if(c->key.dcidlen != d->key.dcidlen) {
   3678 		if(c->key.dcidlen < d->key.dcidlen)
   3679 			return -1;
   3680 		return 1;
   3681 	}
   3682 	if((r=memcmp(c->key.dcid, d->key.dcid, c->key.dcidlen))!=0)
   3683 		return r;
   3684 	return 0;
   3685 }
   3686 
   3687 int doq_conid_cmp(const void* key1, const void* key2)
   3688 {
   3689 	struct doq_conid* c = (struct doq_conid*)key1;
   3690 	struct doq_conid* d = (struct doq_conid*)key2;
   3691 	if(c->cidlen != d->cidlen) {
   3692 		if(c->cidlen < d->cidlen)
   3693 			return -1;
   3694 		return 1;
   3695 	}
   3696 	return memcmp(c->cid, d->cid, c->cidlen);
   3697 }
   3698 
   3699 int doq_timer_cmp(const void* key1, const void* key2)
   3700 {
   3701 	struct doq_timer* e = (struct doq_timer*)key1;
   3702 	struct doq_timer* f = (struct doq_timer*)key2;
   3703 	if(e->time.tv_sec < f->time.tv_sec)
   3704 		return -1;
   3705 	if(e->time.tv_sec > f->time.tv_sec)
   3706 		return 1;
   3707 	if(e->time.tv_usec < f->time.tv_usec)
   3708 		return -1;
   3709 	if(e->time.tv_usec > f->time.tv_usec)
   3710 		return 1;
   3711 	return 0;
   3712 }
   3713 
   3714 int doq_stream_cmp(const void* key1, const void* key2)
   3715 {
   3716 	struct doq_stream* c = (struct doq_stream*)key1;
   3717 	struct doq_stream* d = (struct doq_stream*)key2;
   3718 	if(c->stream_id != d->stream_id) {
   3719 		if(c->stream_id < d->stream_id)
   3720 			return -1;
   3721 		return 1;
   3722 	}
   3723 	return 0;
   3724 }
   3725 
   3726 /** doq store a local address in repinfo */
   3727 static void
   3728 doq_repinfo_store_localaddr(struct comm_reply* repinfo,
   3729 	struct doq_addr_storage* localaddr, socklen_t localaddrlen)
   3730 {
   3731 	/* use the pktinfo that we have for ancillary udp data otherwise,
   3732 	 * this saves space for a sockaddr */
   3733 	memset(&repinfo->pktinfo, 0, sizeof(repinfo->pktinfo));
   3734 	if(addr_is_ip6((void*)localaddr, localaddrlen)) {
   3735 #ifdef IPV6_PKTINFO
   3736 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
   3737 		memmove(&repinfo->pktinfo.v6info.ipi6_addr,
   3738 			&sa6->sin6_addr, sizeof(struct in6_addr));
   3739 		repinfo->doq_srcport = sa6->sin6_port;
   3740 #endif
   3741 		repinfo->srctype = 6;
   3742 	} else {
   3743 #ifdef IP_PKTINFO
   3744 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
   3745 		memmove(&repinfo->pktinfo.v4info.ipi_addr,
   3746 			&sa->sin_addr, sizeof(struct in_addr));
   3747 		repinfo->doq_srcport = sa->sin_port;
   3748 #elif defined(IP_RECVDSTADDR)
   3749 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
   3750 		memmove(&repinfo->pktinfo.v4addr, &sa->sin_addr,
   3751 			sizeof(struct in_addr));
   3752 		repinfo->doq_srcport = sa->sin_port;
   3753 #endif
   3754 		repinfo->srctype = 4;
   3755 	}
   3756 }
   3757 
   3758 /** doq retrieve localaddr from repinfo */
   3759 static void
   3760 doq_repinfo_retrieve_localaddr(struct comm_reply* repinfo,
   3761 	struct doq_addr_storage* localaddr, socklen_t* localaddrlen)
   3762 {
   3763 	if(repinfo->srctype == 6) {
   3764 #ifdef IPV6_PKTINFO
   3765 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
   3766 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in6);
   3767 		memset(sa6, 0, *localaddrlen);
   3768 		sa6->sin6_family = AF_INET6;
   3769 		memmove(&sa6->sin6_addr, &repinfo->pktinfo.v6info.ipi6_addr,
   3770 			sizeof(struct in6_addr));
   3771 		sa6->sin6_port = repinfo->doq_srcport;
   3772 #endif
   3773 	} else {
   3774 #ifdef IP_PKTINFO
   3775 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
   3776 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in);
   3777 		memset(sa, 0, *localaddrlen);
   3778 		sa->sin_family = AF_INET;
   3779 		memmove(&sa->sin_addr, &repinfo->pktinfo.v4info.ipi_addr,
   3780 			sizeof(struct in_addr));
   3781 		sa->sin_port = repinfo->doq_srcport;
   3782 #elif defined(IP_RECVDSTADDR)
   3783 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
   3784 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in);
   3785 		memset(sa, 0, *localaddrlen);
   3786 		sa->sin_family = AF_INET;
   3787 		memmove(&sa->sin_addr, &repinfo->pktinfo.v4addr,
   3788 			sizeof(struct in_addr));
   3789 		sa->sin_port = repinfo->doq_srcport;
   3790 #endif
   3791 	}
   3792 }
   3793 
   3794 /** doq write a connection key into repinfo, false if it does not fit */
   3795 static int
   3796 doq_conn_key_store_repinfo(struct doq_conn_key* key,
   3797 	struct comm_reply* repinfo)
   3798 {
   3799 	repinfo->is_proxied = 0;
   3800 	repinfo->doq_ifindex = key->paddr.ifindex;
   3801 	repinfo->remote_addrlen = key->paddr.addrlen;
   3802 	memmove(&repinfo->remote_addr, &key->paddr.addr,
   3803 		repinfo->remote_addrlen);
   3804 	repinfo->client_addrlen = key->paddr.addrlen;
   3805 	memmove(&repinfo->client_addr, &key->paddr.addr,
   3806 		repinfo->client_addrlen);
   3807 	doq_repinfo_store_localaddr(repinfo, &key->paddr.localaddr,
   3808 		key->paddr.localaddrlen);
   3809 	if(key->dcidlen > sizeof(repinfo->doq_dcid))
   3810 		return 0;
   3811 	repinfo->doq_dcidlen = key->dcidlen;
   3812 	memmove(repinfo->doq_dcid, key->dcid, key->dcidlen);
   3813 	return 1;
   3814 }
   3815 
   3816 void
   3817 doq_conn_key_from_repinfo(struct doq_conn_key* key, struct comm_reply* repinfo)
   3818 {
   3819 	key->paddr.ifindex = repinfo->doq_ifindex;
   3820 	key->paddr.addrlen = repinfo->remote_addrlen;
   3821 	memmove(&key->paddr.addr, &repinfo->remote_addr,
   3822 		repinfo->remote_addrlen);
   3823 	doq_repinfo_retrieve_localaddr(repinfo, &key->paddr.localaddr,
   3824 		&key->paddr.localaddrlen);
   3825 	key->dcidlen = repinfo->doq_dcidlen;
   3826 	key->dcid = repinfo->doq_dcid;
   3827 }
   3828 
   3829 /** doq add a stream to the connection */
   3830 static void
   3831 doq_conn_add_stream(struct doq_conn* conn, struct doq_stream* stream)
   3832 {
   3833 	(void)rbtree_insert(&conn->stream_tree, &stream->node);
   3834 }
   3835 
   3836 /** doq delete a stream from the connection */
   3837 static void
   3838 doq_conn_del_stream(struct doq_conn* conn, struct doq_stream* stream)
   3839 {
   3840 	(void)rbtree_delete(&conn->stream_tree, &stream->node);
   3841 }
   3842 
   3843 /** doq create new stream */
   3844 static struct doq_stream*
   3845 doq_stream_create(int64_t stream_id)
   3846 {
   3847 	struct doq_stream* stream = calloc(1, sizeof(*stream));
   3848 	if(!stream)
   3849 		return NULL;
   3850 	stream->node.key = stream;
   3851 	stream->stream_id = stream_id;
   3852 	return stream;
   3853 }
   3854 
   3855 void doq_stream_delete(struct doq_stream* stream)
   3856 {
   3857 	if(!stream)
   3858 		return;
   3859 	free(stream->in);
   3860 	free(stream->out);
   3861 	free(stream);
   3862 }
   3863 
   3864 struct doq_stream*
   3865 doq_stream_find(struct doq_conn* conn, int64_t stream_id)
   3866 {
   3867 	rbnode_type* node;
   3868 	struct doq_stream key;
   3869 	key.node.key = &key;
   3870 	key.stream_id = stream_id;
   3871 	node = rbtree_search(&conn->stream_tree, &key);
   3872 	if(node)
   3873 		return (struct doq_stream*)node->key;
   3874 	return NULL;
   3875 }
   3876 
   3877 /** doq put stream on the conn write list */
   3878 static void
   3879 doq_stream_on_write_list(struct doq_conn* conn, struct doq_stream* stream)
   3880 {
   3881 	if(stream->on_write_list)
   3882 		return;
   3883 	stream->write_prev = conn->stream_write_last;
   3884 	if(conn->stream_write_last)
   3885 		conn->stream_write_last->write_next = stream;
   3886 	else
   3887 		conn->stream_write_first = stream;
   3888 	conn->stream_write_last = stream;
   3889 	stream->write_next = NULL;
   3890 	stream->on_write_list = 1;
   3891 }
   3892 
   3893 /** doq remove stream from the conn write list */
   3894 static void
   3895 doq_stream_off_write_list(struct doq_conn* conn, struct doq_stream* stream)
   3896 {
   3897 	if(!stream->on_write_list)
   3898 		return;
   3899 	if(stream->write_next)
   3900 		stream->write_next->write_prev = stream->write_prev;
   3901 	else conn->stream_write_last = stream->write_prev;
   3902 	if(stream->write_prev)
   3903 		stream->write_prev->write_next = stream->write_next;
   3904 	else conn->stream_write_first = stream->write_next;
   3905 	stream->write_prev = NULL;
   3906 	stream->write_next = NULL;
   3907 	stream->on_write_list = 0;
   3908 }
   3909 
   3910 /** doq stream remove in buffer */
   3911 static void
   3912 doq_stream_remove_in_buffer(struct doq_stream* stream, struct doq_table* table)
   3913 {
   3914 	if(stream->in) {
   3915 		doq_table_quic_size_subtract(table, stream->inlen);
   3916 		free(stream->in);
   3917 		stream->in = NULL;
   3918 		stream->inlen = 0;
   3919 	}
   3920 }
   3921 
   3922 /** doq stream remove out buffer */
   3923 static void
   3924 doq_stream_remove_out_buffer(struct doq_stream* stream,
   3925 	struct doq_table* table)
   3926 {
   3927 	if(stream->out) {
   3928 		doq_table_quic_size_subtract(table, stream->outlen);
   3929 		free(stream->out);
   3930 		stream->out = NULL;
   3931 		stream->outlen = 0;
   3932 	}
   3933 }
   3934 
   3935 int
   3936 doq_stream_close(struct doq_conn* conn, struct doq_stream* stream,
   3937 	int send_shutdown)
   3938 {
   3939 	int ret;
   3940 	if(stream->is_closed)
   3941 		return 1;
   3942 	stream->is_closed = 1;
   3943 	doq_stream_off_write_list(conn, stream);
   3944 	if(send_shutdown) {
   3945 		verbose(VERB_ALGO, "doq: shutdown stream_id %d with app_error_code %d",
   3946 			(int)stream->stream_id, (int)DOQ_APP_ERROR_CODE);
   3947 		ret = ngtcp2_conn_shutdown_stream(conn->conn,
   3948 #ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4
   3949 			0,
   3950 #endif
   3951 			stream->stream_id, DOQ_APP_ERROR_CODE);
   3952 		if(ret != 0) {
   3953 			log_err("doq ngtcp2_conn_shutdown_stream %d failed: %s",
   3954 				(int)stream->stream_id, ngtcp2_strerror(ret));
   3955 			return 0;
   3956 		}
   3957 		doq_conn_write_enable(conn);
   3958 	}
   3959 	verbose(VERB_ALGO, "doq: conn extend max streams bidi by 1");
   3960 	ngtcp2_conn_extend_max_streams_bidi(conn->conn, 1);
   3961 	doq_conn_write_enable(conn);
   3962 	doq_stream_remove_in_buffer(stream, conn->doq_socket->table);
   3963 	doq_stream_remove_out_buffer(stream, conn->doq_socket->table);
   3964 	doq_table_quic_size_subtract(conn->doq_socket->table, sizeof(*stream));
   3965 	doq_conn_del_stream(conn, stream);
   3966 	doq_stream_delete(stream);
   3967 	return 1;
   3968 }
   3969 
   3970 /** doq stream pick up answer data from buffer */
   3971 static int
   3972 doq_stream_pickup_answer(struct doq_stream* stream, struct sldns_buffer* buf)
   3973 {
   3974 	stream->is_answer_available = 1;
   3975 	if(stream->out) {
   3976 		free(stream->out);
   3977 		stream->out = NULL;
   3978 		stream->outlen = 0;
   3979 	}
   3980 	stream->nwrite = 0;
   3981 	stream->outlen = sldns_buffer_limit(buf);
   3982 	/* For quic the output bytes have to stay allocated and available,
   3983 	 * for potential resends, until the remote end has acknowledged them.
   3984 	 * This includes the tcplen start uint16_t, in outlen_wire. */
   3985 	stream->outlen_wire = htons(stream->outlen);
   3986 	stream->out = memdup(sldns_buffer_begin(buf), sldns_buffer_limit(buf));
   3987 	if(!stream->out) {
   3988 		log_err("doq could not send answer: out of memory");
   3989 		return 0;
   3990 	}
   3991 	return 1;
   3992 }
   3993 
   3994 int
   3995 doq_stream_send_reply(struct doq_conn* conn, struct doq_stream* stream,
   3996 	struct sldns_buffer* buf)
   3997 {
   3998 	if(verbosity >= VERB_ALGO) {
   3999 		char* s = sldns_wire2str_pkt(sldns_buffer_begin(buf),
   4000 			sldns_buffer_limit(buf));
   4001 		verbose(VERB_ALGO, "doq stream %d response\n%s",
   4002 			(int)stream->stream_id, (s?s:"null"));
   4003 		free(s);
   4004 	}
   4005 	if(stream->out)
   4006 		doq_table_quic_size_subtract(conn->doq_socket->table,
   4007 			stream->outlen);
   4008 	if(!doq_stream_pickup_answer(stream, buf))
   4009 		return 0;
   4010 	doq_table_quic_size_add(conn->doq_socket->table, stream->outlen);
   4011 	doq_stream_on_write_list(conn, stream);
   4012 	doq_conn_write_enable(conn);
   4013 	return 1;
   4014 }
   4015 
   4016 /** doq stream data length has completed, allocations can be done. False on
   4017  * allocation failure. */
   4018 static int
   4019 doq_stream_datalen_complete(struct doq_stream* stream, struct doq_table* table)
   4020 {
   4021 	if(stream->inlen > 1024*1024) {
   4022 		log_err("doq stream in length too large %d",
   4023 			(int)stream->inlen);
   4024 		return 0;
   4025 	}
   4026 	stream->in = calloc(1, stream->inlen);
   4027 	if(!stream->in) {
   4028 		log_err("doq could not read stream, calloc failed: "
   4029 			"out of memory");
   4030 		return 0;
   4031 	}
   4032 	doq_table_quic_size_add(table, stream->inlen);
   4033 	return 1;
   4034 }
   4035 
   4036 /** doq stream data is complete, the input data has been received. */
   4037 static int
   4038 doq_stream_data_complete(struct doq_conn* conn, struct doq_stream* stream)
   4039 {
   4040 	struct comm_point* c;
   4041 	if(verbosity >= VERB_ALGO) {
   4042 		char* s = sldns_wire2str_pkt(stream->in, stream->inlen);
   4043 		char a[128];
   4044 		addr_to_str((void*)&conn->key.paddr.addr,
   4045 			conn->key.paddr.addrlen, a, sizeof(a));
   4046 		verbose(VERB_ALGO, "doq %s stream %d incoming query\n%s",
   4047 			a, (int)stream->stream_id, (s?s:"null"));
   4048 		free(s);
   4049 	}
   4050 	stream->is_query_complete = 1;
   4051 	c = conn->doq_socket->cp;
   4052 	if(!stream->in) {
   4053 		verbose(VERB_ALGO, "doq_stream_data_complete: no in buffer");
   4054 		return 0;
   4055 	}
   4056 	if(stream->inlen > sldns_buffer_capacity(c->buffer)) {
   4057 		verbose(VERB_ALGO, "doq_stream_data_complete: query too long");
   4058 		return 0;
   4059 	}
   4060 	sldns_buffer_clear(c->buffer);
   4061 	sldns_buffer_write(c->buffer, stream->in, stream->inlen);
   4062 	sldns_buffer_flip(c->buffer);
   4063 	c->repinfo.c = c;
   4064 	if(!doq_conn_key_store_repinfo(&conn->key, &c->repinfo)) {
   4065 		verbose(VERB_ALGO, "doq_stream_data_complete: connection "
   4066 			"DCID too long");
   4067 		return 0;
   4068 	}
   4069 	c->repinfo.doq_streamid = stream->stream_id;
   4070 	conn->doq_socket->current_conn = conn;
   4071 	fptr_ok(fptr_whitelist_comm_point(c->callback));
   4072 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo)) {
   4073 		conn->doq_socket->current_conn = NULL;
   4074 		if(!doq_stream_send_reply(conn, stream, c->buffer)) {
   4075 			verbose(VERB_ALGO, "doq: failed to send_reply");
   4076 			return 0;
   4077 		}
   4078 		return 1;
   4079 	}
   4080 	conn->doq_socket->current_conn = NULL;
   4081 	return 1;
   4082 }
   4083 
/** doq receive data for a stream, more bytes of the incoming data.
 * The stream starts with a two byte length in network order, followed by
 * that many bytes of data. stream->nread counts all bytes consumed so far,
 * including the two length bytes. The in buffer is allocated once the
 * length is complete.
 * *recv_done is set to 1 when this call stored data and the stream has its
 * full length; it is never set to 0 here.
 * Returns false when the buffer could not be set up, when nread is past
 * the stated length, or when the in buffer is missing; true otherwise. */
static int
doq_stream_recv_data(struct doq_stream* stream, const uint8_t* data,
	size_t datalen, int* recv_done, struct doq_table* table)
{
	int got_data = 0;
	/* read the tcplength uint16_t at the start */
	if(stream->nread < 2) {
		uint16_t tcplen = 0;
		size_t todolen = 2 - stream->nread;

		if(stream->nread > 0) {
			/* put in the already read byte if there is one */
			tcplen = stream->inlen;
		}
		if(datalen < todolen)
			todolen = datalen;
		/* the length bytes are placed at their byte position in
		 * tcplen, which is converted from network order only when
		 * both bytes are there */
		memmove(((uint8_t*)&tcplen)+stream->nread, data, todolen);
		stream->nread += todolen;
		data += todolen;
		datalen -= todolen;
		if(stream->nread == 2) {
			/* the initial length value is completed */
			stream->inlen = ntohs(tcplen);
			if(!doq_stream_datalen_complete(stream, table))
				return 0;
		} else {
			/* store for later */
			/* inlen holds the unconverted partial value until
			 * the second length byte arrives */
			stream->inlen = tcplen;
			return 1;
		}
	}
	/* if there are more data bytes */
	if(datalen > 0) {
		size_t to_write = datalen;
		if(stream->nread-2 > stream->inlen) {
			verbose(VERB_ALGO, "doq stream buffer too small");
			return 0;
		}
		/* store no more than what remains of the stated length */
		if(datalen > stream->inlen - (stream->nread-2))
			to_write = stream->inlen - (stream->nread-2);
		if(to_write > 0) {
			if(!stream->in) {
				verbose(VERB_ALGO, "doq: stream has "
					"no buffer");
				return 0;
			}
			memmove(stream->in+(stream->nread-2), data, to_write);
			stream->nread += to_write;
			data += to_write;
			datalen -= to_write;
			got_data = 1;
		}
	}
	/* Are there extra bytes received after the end? If so, log them. */
	if(datalen > 0) {
		if(verbosity >= VERB_ALGO)
			log_hex("doq stream has extra bytes received after end",
				(void*)data, datalen);
	}
	/* Is the input data complete? */
	/* got_data is required, so a call that stored no data bytes does
	 * not signal completion */
	if(got_data && stream->nread >= stream->inlen+2) {
		if(!stream->in) {
			verbose(VERB_ALGO, "doq: completed stream has "
				"no buffer");
			return 0;
		}
		*recv_done = 1;
	}
	return 1;
}
   4155 
   4156 /** doq receive FIN for a stream. No more bytes are going to arrive. */
   4157 static int
   4158 doq_stream_recv_fin(struct doq_conn* conn, struct doq_stream* stream, int
   4159 	recv_done)
   4160 {
   4161 	if(!stream->is_query_complete && !recv_done) {
   4162 		verbose(VERB_ALGO, "doq: stream recv FIN, but is "
   4163 			"not complete, have %d of %d bytes",
   4164 			((int)stream->nread)-2, (int)stream->inlen);
   4165 		if(!doq_stream_close(conn, stream, 1))
   4166 			return 0;
   4167 	}
   4168 	return 1;
   4169 }
   4170 
/**
 * Fill a buffer with random bytes; one ub_random call is made per byte and
 * its low eight bits are used.
 * @param rnd: random state.
 * @param buf: buffer to fill.
 * @param len: number of bytes to write.
 */
void doq_fill_rand(struct ub_randstate* rnd, uint8_t* buf, size_t len)
{
	size_t pos = 0;
	while(pos < len) {
		buf[pos] = ub_random(rnd)&0xff;
		pos++;
	}
}
   4177 
   4178 /** generate new connection id, checks for duplicates.
   4179  * caller must hold lock on conid tree. */
   4180 static int
   4181 doq_conn_generate_new_conid(struct doq_conn* conn, uint8_t* data,
   4182 	size_t datalen)
   4183 {
   4184 	int max_try = 100;
   4185 	int i;
   4186 	for(i=0; i<max_try; i++) {
   4187 		doq_fill_rand(conn->doq_socket->rnd, data, datalen);
   4188 		if(!doq_conid_find(conn->table, data, datalen)) {
   4189 			/* Found an unused connection id. */
   4190 			return 1;
   4191 		}
   4192 	}
   4193 	verbose(VERB_ALGO, "doq_conn_generate_new_conid failed: could not "
   4194 		"generate random unused connection id value in %d attempts.",
   4195 		max_try);
   4196 	return 0;
   4197 }
   4198 
   4199 /** ngtcp2 rand callback function */
   4200 static void
   4201 doq_rand_cb(uint8_t* dest, size_t destlen, const ngtcp2_rand_ctx* rand_ctx)
   4202 {
   4203 	struct ub_randstate* rnd = (struct ub_randstate*)
   4204 		rand_ctx->native_handle;
   4205 	doq_fill_rand(rnd, dest, destlen);
   4206 }
   4207 
   4208 /** ngtcp2 get_new_connection_id callback function */
   4209 static int
   4210 doq_get_new_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn), ngtcp2_cid* cid,
   4211 	uint8_t* token, size_t cidlen, void* user_data)
   4212 {
   4213 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4214 	/* Lock the conid tree, so we can check for duplicates while
   4215 	 * generating the id, and then insert it, whilst keeping the tree
   4216 	 * locked against other modifications, guaranteeing uniqueness. */
   4217 	lock_rw_wrlock(&doq_conn->table->conid_lock);
   4218 	if(!doq_conn_generate_new_conid(doq_conn, cid->data, cidlen)) {
   4219 		lock_rw_unlock(&doq_conn->table->conid_lock);
   4220 		return NGTCP2_ERR_CALLBACK_FAILURE;
   4221 	}
   4222 	cid->datalen = cidlen;
   4223 	if(ngtcp2_crypto_generate_stateless_reset_token(token,
   4224 		doq_conn->doq_socket->static_secret,
   4225 		doq_conn->doq_socket->static_secret_len, cid) != 0) {
   4226 		lock_rw_unlock(&doq_conn->table->conid_lock);
   4227 		return NGTCP2_ERR_CALLBACK_FAILURE;
   4228 	}
   4229 	if(!doq_conn_associate_conid(doq_conn, cid->data, cid->datalen)) {
   4230 		lock_rw_unlock(&doq_conn->table->conid_lock);
   4231 		return NGTCP2_ERR_CALLBACK_FAILURE;
   4232 	}
   4233 	lock_rw_unlock(&doq_conn->table->conid_lock);
   4234 	return 0;
   4235 }
   4236 
   4237 /** ngtcp2 remove_connection_id callback function */
   4238 static int
   4239 doq_remove_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn),
   4240 	const ngtcp2_cid* cid, void* user_data)
   4241 {
   4242 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4243 	lock_rw_wrlock(&doq_conn->table->conid_lock);
   4244 	doq_conn_dissociate_conid(doq_conn, cid->data, cid->datalen);
   4245 	lock_rw_unlock(&doq_conn->table->conid_lock);
   4246 	return 0;
   4247 }
   4248 
   4249 /** doq submit a new token */
   4250 static int
   4251 doq_submit_new_token(struct doq_conn* conn)
   4252 {
   4253 	uint8_t token[NGTCP2_CRYPTO_MAX_REGULAR_TOKENLEN];
   4254 	ngtcp2_ssize tokenlen;
   4255 	int ret;
   4256 	const ngtcp2_path* path = ngtcp2_conn_get_path(conn->conn);
   4257 	ngtcp2_tstamp ts = doq_get_timestamp_nanosec();
   4258 
   4259 	tokenlen = ngtcp2_crypto_generate_regular_token(token,
   4260 		conn->doq_socket->static_secret,
   4261 		conn->doq_socket->static_secret_len, path->remote.addr,
   4262 		path->remote.addrlen, ts);
   4263 	if(tokenlen < 0) {
   4264 		log_err("doq ngtcp2_crypto_generate_regular_token failed");
   4265 		return 1;
   4266 	}
   4267 
   4268 	verbose(VERB_ALGO, "doq submit new token");
   4269 	ret = ngtcp2_conn_submit_new_token(conn->conn, token, tokenlen);
   4270 	if(ret != 0) {
   4271 		log_err("doq ngtcp2_conn_submit_new_token failed: %s",
   4272 			ngtcp2_strerror(ret));
   4273 		return 0;
   4274 	}
   4275 	return 1;
   4276 }
   4277 
   4278 /** ngtcp2 handshake_completed callback function */
   4279 static int
   4280 doq_handshake_completed_cb(ngtcp2_conn* ATTR_UNUSED(conn), void* user_data)
   4281 {
   4282 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4283 	verbose(VERB_ALGO, "doq handshake_completed callback");
   4284 	verbose(VERB_ALGO, "ngtcp2_conn_get_max_data_left is %d",
   4285 		(int)ngtcp2_conn_get_max_data_left(doq_conn->conn));
   4286 #ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI
   4287 	verbose(VERB_ALGO, "ngtcp2_conn_get_max_local_streams_uni is %d",
   4288 		(int)ngtcp2_conn_get_max_local_streams_uni(doq_conn->conn));
   4289 #endif
   4290 	verbose(VERB_ALGO, "ngtcp2_conn_get_streams_uni_left is %d",
   4291 		(int)ngtcp2_conn_get_streams_uni_left(doq_conn->conn));
   4292 	verbose(VERB_ALGO, "ngtcp2_conn_get_streams_bidi_left is %d",
   4293 		(int)ngtcp2_conn_get_streams_bidi_left(doq_conn->conn));
   4294 	verbose(VERB_ALGO, "negotiated cipher name is %s",
   4295 		SSL_get_cipher_name(doq_conn->ssl));
   4296 	if(verbosity > VERB_ALGO) {
   4297 		const unsigned char* alpn = NULL;
   4298 		unsigned int alpnlen = 0;
   4299 		char alpnstr[128];
   4300 		SSL_get0_alpn_selected(doq_conn->ssl, &alpn, &alpnlen);
   4301 		if(alpnlen > sizeof(alpnstr)-1)
   4302 			alpnlen = sizeof(alpnstr)-1;
   4303 		memmove(alpnstr, alpn, alpnlen);
   4304 		alpnstr[alpnlen]=0;
   4305 		verbose(VERB_ALGO, "negotiated ALPN is '%s'", alpnstr);
   4306 	}
   4307 
   4308 	if(!doq_submit_new_token(doq_conn))
   4309 		return -1;
   4310 	return 0;
   4311 }
   4312 
   4313 /** ngtcp2 stream_open callback function */
   4314 static int
   4315 doq_stream_open_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id,
   4316 	void* user_data)
   4317 {
   4318 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4319 	struct doq_stream* stream;
   4320 	verbose(VERB_ALGO, "doq new stream %x", (int)stream_id);
   4321 	if(doq_stream_find(doq_conn, stream_id)) {
   4322 		verbose(VERB_ALGO, "doq: stream with this id already exists");
   4323 		return 0;
   4324 	}
   4325 	if(stream_id != 0 && stream_id != 4 && /* allow one stream on a new connection */
   4326 		!doq_table_quic_size_available(doq_conn->doq_socket->table,
   4327 		doq_conn->doq_socket->cfg, sizeof(*stream)
   4328 		+ 100 /* estimated query in */
   4329 		+ 512 /* estimated response out */
   4330 		)) {
   4331 		int rv;
   4332 		verbose(VERB_ALGO, "doq: no mem for new stream");
   4333 		rv = ngtcp2_conn_shutdown_stream(doq_conn->conn,
   4334 #ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4
   4335 			0,
   4336 #endif
   4337 			stream_id, NGTCP2_CONNECTION_REFUSED);
   4338 		if(rv != 0) {
   4339 			log_err("ngtcp2_conn_shutdown_stream failed: %s",
   4340 				ngtcp2_strerror(rv));
   4341 			return NGTCP2_ERR_CALLBACK_FAILURE;
   4342 		}
   4343 		return 0;
   4344 	}
   4345 	stream = doq_stream_create(stream_id);
   4346 	if(!stream) {
   4347 		log_err("doq: could not doq_stream_create: out of memory");
   4348 		return NGTCP2_ERR_CALLBACK_FAILURE;
   4349 	}
   4350 	doq_table_quic_size_add(doq_conn->doq_socket->table, sizeof(*stream));
   4351 	doq_conn_add_stream(doq_conn, stream);
   4352 	return 0;
   4353 }
   4354 
   4355 /** ngtcp2 recv_stream_data callback function */
   4356 static int
   4357 doq_recv_stream_data_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags,
   4358 	int64_t stream_id, uint64_t offset, const uint8_t* data,
   4359 	size_t datalen, void* user_data, void* ATTR_UNUSED(stream_user_data))
   4360 {
   4361 	int recv_done = 0;
   4362 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4363 	struct doq_stream* stream;
   4364 	verbose(VERB_ALGO, "doq recv stream data stream id %d offset %d "
   4365 		"datalen %d%s%s", (int)stream_id, (int)offset, (int)datalen,
   4366 		((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0?" FIN":""),
   4367 #ifdef NGTCP2_STREAM_DATA_FLAG_0RTT
   4368 		((flags&NGTCP2_STREAM_DATA_FLAG_0RTT)!=0?" 0RTT":"")
   4369 #else
   4370 		((flags&NGTCP2_STREAM_DATA_FLAG_EARLY)!=0?" EARLY":"")
   4371 #endif
   4372 		);
   4373 	stream = doq_stream_find(doq_conn, stream_id);
   4374 	if(!stream) {
   4375 		verbose(VERB_ALGO, "doq: received stream data for "
   4376 			"unknown stream %d", (int)stream_id);
   4377 		return 0;
   4378 	}
   4379 	if(stream->is_closed) {
   4380 		verbose(VERB_ALGO, "doq: stream is closed, ignore recv data");
   4381 		return 0;
   4382 	}
   4383 	if(datalen != 0) {
   4384 		if(!doq_stream_recv_data(stream, data, datalen, &recv_done,
   4385 			doq_conn->doq_socket->table))
   4386 			return NGTCP2_ERR_CALLBACK_FAILURE;
   4387 	}
   4388 	if((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0) {
   4389 		if(!doq_stream_recv_fin(doq_conn, stream, recv_done))
   4390 			return NGTCP2_ERR_CALLBACK_FAILURE;
   4391 	}
   4392 	ngtcp2_conn_extend_max_stream_offset(doq_conn->conn, stream_id,
   4393 		datalen);
   4394 	ngtcp2_conn_extend_max_offset(doq_conn->conn, datalen);
   4395 	if(recv_done) {
   4396 		if(!doq_stream_data_complete(doq_conn, stream))
   4397 			return NGTCP2_ERR_CALLBACK_FAILURE;
   4398 	}
   4399 	return 0;
   4400 }
   4401 
   4402 /** ngtcp2 stream_close callback function */
   4403 static int
   4404 doq_stream_close_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags,
   4405 	int64_t stream_id, uint64_t app_error_code, void* user_data,
   4406 	void* ATTR_UNUSED(stream_user_data))
   4407 {
   4408 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4409 	struct doq_stream* stream;
   4410 	if((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)
   4411 		verbose(VERB_ALGO, "doq stream close for stream id %d %sapp_error_code %d",
   4412 		(int)stream_id,
   4413 		(((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)?
   4414 		"APP_ERROR_CODE_SET ":""),
   4415 		(int)app_error_code);
   4416 	else
   4417 		verbose(VERB_ALGO, "doq stream close for stream id %d",
   4418 			(int)stream_id);
   4419 
   4420 	stream = doq_stream_find(doq_conn, stream_id);
   4421 	if(!stream) {
   4422 		verbose(VERB_ALGO, "doq: stream close for "
   4423 			"unknown stream %d", (int)stream_id);
   4424 		return 0;
   4425 	}
   4426 	if(!doq_stream_close(doq_conn, stream, 0))
   4427 		return NGTCP2_ERR_CALLBACK_FAILURE;
   4428 	return 0;
   4429 }
   4430 
   4431 /** ngtcp2 stream_reset callback function */
   4432 static int
   4433 doq_stream_reset_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id,
   4434 	uint64_t final_size, uint64_t app_error_code, void* user_data,
   4435 	void* ATTR_UNUSED(stream_user_data))
   4436 {
   4437 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4438 	struct doq_stream* stream;
   4439 	verbose(VERB_ALGO, "doq stream reset for stream id %d final_size %d "
   4440 		"app_error_code %d", (int)stream_id, (int)final_size,
   4441 		(int)app_error_code);
   4442 
   4443 	stream = doq_stream_find(doq_conn, stream_id);
   4444 	if(!stream) {
   4445 		verbose(VERB_ALGO, "doq: stream reset for "
   4446 			"unknown stream %d", (int)stream_id);
   4447 		return 0;
   4448 	}
   4449 	if(!doq_stream_close(doq_conn, stream, 0))
   4450 		return NGTCP2_ERR_CALLBACK_FAILURE;
   4451 	return 0;
   4452 }
   4453 
   4454 /** ngtcp2 acked_stream_data_offset callback function */
   4455 static int
   4456 doq_acked_stream_data_offset_cb(ngtcp2_conn* ATTR_UNUSED(conn),
   4457 	int64_t stream_id, uint64_t offset, uint64_t datalen, void* user_data,
   4458 	void* ATTR_UNUSED(stream_user_data))
   4459 {
   4460 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
   4461 	struct doq_stream* stream;
   4462 	verbose(VERB_ALGO, "doq stream acked data for stream id %d offset %d "
   4463 		"datalen %d", (int)stream_id, (int)offset, (int)datalen);
   4464 
   4465 	stream = doq_stream_find(doq_conn, stream_id);
   4466 	if(!stream) {
   4467 		verbose(VERB_ALGO, "doq: stream acked data for "
   4468 			"unknown stream %d", (int)stream_id);
   4469 		return 0;
   4470 	}
   4471 	/* Acked the data from [offset .. offset+datalen). */
   4472 	if(stream->is_closed)
   4473 		return 0;
   4474 	if(offset+datalen >= stream->outlen) {
   4475 		doq_stream_remove_in_buffer(stream,
   4476 			doq_conn->doq_socket->table);
   4477 		doq_stream_remove_out_buffer(stream,
   4478 			doq_conn->doq_socket->table);
   4479 	}
   4480 	return 0;
   4481 }
   4482 
   4483 /** ngtc2p log_printf callback function */
   4484 static void
   4485 doq_log_printf_cb(void* ATTR_UNUSED(user_data), const char* fmt, ...)
   4486 {
   4487 	char buf[1024];
   4488 	va_list ap;
   4489 	va_start(ap, fmt);
   4490 	vsnprintf(buf, sizeof(buf), fmt, ap);
   4491 	verbose(VERB_ALGO, "libngtcp2: %s", buf);
   4492 	va_end(ap);
   4493 }
   4494 
   4495 #ifdef MAKE_QUIC_METHOD
   4496 /** the doq application tx key callback, false on failure */
   4497 static int
   4498 doq_application_tx_key_cb(struct doq_conn* conn)
   4499 {
   4500 	verbose(VERB_ALGO, "doq application tx key cb");
   4501 	/* The server does not want to open streams to the client,
   4502 	 * the client instead initiates by opening bidi streams. */
   4503 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_data_left is %d",
   4504 		(int)ngtcp2_conn_get_max_data_left(conn->conn));
   4505 #ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI
   4506 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_local_streams_uni is %d",
   4507 		(int)ngtcp2_conn_get_max_local_streams_uni(conn->conn));
   4508 #endif
   4509 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_uni_left is %d",
   4510 		(int)ngtcp2_conn_get_streams_uni_left(conn->conn));
   4511 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_bidi_left is %d",
   4512 		(int)ngtcp2_conn_get_streams_bidi_left(conn->conn));
   4513 	return 1;
   4514 }
   4515 
/**
 * quic_method set_encryption_secrets function.
 * Converts ossl_level to the ngtcp2 level and installs the rx key for
 * read_secret and the tx key for write_secret on the ngtcp2 connection
 * of the doq_conn that is stored as app data on the ssl. Either secret
 * may be NULL, in which case that direction is skipped.
 * After a tx key for the application level is installed, the
 * application tx key callback is called.
 * Returns 1 on success, 0 on failure.
 */
static int
doq_set_encryption_secrets(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level,
	const uint8_t *read_secret, const uint8_t *write_secret,
	size_t secret_len)
{
	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
	/* The type name of the level, and the conversion routine, depend
	 * on what the configure checks detected. */
#ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL
	ngtcp2_encryption_level
#else
	ngtcp2_crypto_level
#endif
		level =
#ifdef USE_NGTCP2_CRYPTO_OSSL
		ngtcp2_crypto_ossl_from_ossl_encryption_level(ossl_level);
#elif defined(HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL)
		ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level);
#else
		ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level);
#endif

	if(read_secret) {
		verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_rx_key for level %d ossl %d", (int)level, (int)ossl_level);
		if(ngtcp2_crypto_derive_and_install_rx_key(doq_conn->conn,
			NULL, NULL, NULL, level, read_secret, secret_len)
			!= 0) {
			log_err("ngtcp2_crypto_derive_and_install_rx_key "
				"failed");
			return 0;
		}
	}

	if(write_secret) {
		verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_tx_key for level %d ossl %d", (int)level, (int)ossl_level);
		if(ngtcp2_crypto_derive_and_install_tx_key(doq_conn->conn,
			NULL, NULL, NULL, level, write_secret, secret_len)
			!= 0) {
			log_err("ngtcp2_crypto_derive_and_install_tx_key "
				"failed");
			return 0;
		}
		/* NOTE(review): this constant has the ngtcp2_crypto_level
		 * naming, also when level is declared with the
		 * ngtcp2_encryption_level type above; confirm that it is
		 * available in that configuration. */
		if(level == NGTCP2_CRYPTO_LEVEL_APPLICATION) {
			if(!doq_application_tx_key_cb(doq_conn))
				return 0;
		}
	}
	return 1;
}
   4564 
/**
 * quic_method add_handshake_data function.
 * Converts ossl_level to the ngtcp2 level and submits the len bytes of
 * handshake data as crypto data on the ngtcp2 connection of the doq_conn
 * that is stored as app data on the ssl.
 * On failure the ngtcp2 error code is set as the tls error on the
 * connection.
 * Returns 1 on success, 0 on failure.
 */
static int
doq_add_handshake_data(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level,
	const uint8_t *data, size_t len)
{
	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
	/* The type name of the level, and the conversion routine, depend
	 * on what the configure checks detected. */
#ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL
	ngtcp2_encryption_level
#else
	ngtcp2_crypto_level
#endif
		level =
#ifdef USE_NGTCP2_CRYPTO_OSSL
		ngtcp2_crypto_ossl_from_ossl_encryption_level(ossl_level);
#elif defined(HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL)
		ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level);
#else
		ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level);
#endif
	int rv;

	verbose(VERB_ALGO, "doq_add_handshake_data: "
		"ngtcp2_con_submit_crypto_data level %d", (int)level);
	rv = ngtcp2_conn_submit_crypto_data(doq_conn->conn, level, data, len);
	if(rv != 0) {
		log_err("ngtcp2_conn_submit_crypto_data failed: %s",
			ngtcp2_strerror(rv));
		/* record the error code on the connection */
		ngtcp2_conn_set_tls_error(doq_conn->conn, rv);
		return 0;
	}
	return 1;
}
   4597 
/**
 * quic_method flush_flight function.
 * Nothing is done here; it always returns 1.
 */
static int
doq_flush_flight(SSL* ATTR_UNUSED(ssl))
{
	return 1;
}
   4604 
   4605 /** quic_method send_alert function */
   4606 static int
   4607 doq_send_alert(SSL *ssl, enum ssl_encryption_level_t ATTR_UNUSED(level),
   4608 	uint8_t alert)
   4609 {
   4610 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
   4611 	doq_conn->tls_alert = alert;
   4612 	return 1;
   4613 }
   4614 #endif /* MAKE_QUIC_METHOD */
   4615 
   4616 /** ALPN select callback for the doq SSL context */
   4617 static int
   4618 doq_alpn_select_cb(SSL* ATTR_UNUSED(ssl), const unsigned char** out,
   4619 	unsigned char* outlen, const unsigned char* in, unsigned int inlen,
   4620 	void* ATTR_UNUSED(arg))
   4621 {
   4622 	/* select "doq" */
   4623 	int ret = SSL_select_next_proto((void*)out, outlen,
   4624 		(const unsigned char*)"\x03""doq", 4, in, inlen);
   4625 	if(ret == OPENSSL_NPN_NEGOTIATED)
   4626 		return SSL_TLSEXT_ERR_OK;
   4627 	verbose(VERB_ALGO, "doq alpn_select_cb: ALPN from client does "
   4628 		"not have 'doq'");
   4629 	return SSL_TLSEXT_ERR_ALERT_FATAL;
   4630 }
   4631 
/*
 * Create the SSL context for the doq server.
 * key: file name of the private key, must not be NULL or empty.
 * pem: file name of the certificate chain, must not be NULL or empty.
 * verifypem: if not NULL and not empty, the file with certificates used
 *	to verify clients; peer verification is then enabled.
 * Returns the SSL_CTX as a void pointer, or NULL on failure.
 * Without HAVE_NGTCP2 the arguments are ignored and NULL is returned.
 */
void* quic_sslctx_create(char* key, char* pem, char* verifypem)
{
#ifdef HAVE_NGTCP2
	char* sid_ctx = "unbound server";
#ifdef MAKE_QUIC_METHOD
	SSL_QUIC_METHOD* quic_method;
#endif
	SSL_CTX* ctx = SSL_CTX_new(TLS_server_method());
	if(!ctx) {
		log_crypto_err("Could not SSL_CTX_new");
		return NULL;
	}
	if(!key || key[0] == 0) {
		log_err("doq: error, no tls-service-key file specified");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!pem || pem[0] == 0) {
		log_err("doq: error, no tls-service-pem file specified");
		SSL_CTX_free(ctx);
		return NULL;
	}
	SSL_CTX_set_options(ctx,
		(SSL_OP_ALL & ~SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS) |
		SSL_OP_SINGLE_ECDH_USE |
		SSL_OP_CIPHER_SERVER_PREFERENCE |
		SSL_OP_NO_ANTI_REPLAY);
	SSL_CTX_set_mode(ctx, SSL_MODE_RELEASE_BUFFERS);
	/* min and max are both TLS 1.3, so only that version is allowed */
	SSL_CTX_set_min_proto_version(ctx, TLS1_3_VERSION);
	SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION);
#ifdef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
	SSL_CTX_set_alpn_select_cb(ctx, doq_alpn_select_cb, NULL);
#endif
	SSL_CTX_set_default_verify_paths(ctx);
	/* load the certificate chain and the private key, and check that
	 * they belong together */
	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
		log_err("doq: error for cert file: %s", pem);
		log_crypto_err("doq: error in "
			"SSL_CTX_use_certificate_chain_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
		log_err("doq: error for private key file: %s", key);
		log_crypto_err("doq: error in SSL_CTX_use_PrivateKey_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_check_private_key(ctx)) {
		log_err("doq: error for key file: %s", key);
		log_crypto_err("doq: error in SSL_CTX_check_private_key");
		SSL_CTX_free(ctx);
		return NULL;
	}
	SSL_CTX_set_session_id_context(ctx, (void*)sid_ctx, strlen(sid_ctx));
	if(verifypem && verifypem[0]) {
		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
			log_err("doq: error for verify pem file: %s",
				verifypem);
			log_crypto_err("doq: error in "
				"SSL_CTX_load_verify_locations");
			SSL_CTX_free(ctx);
			return NULL;
		}
		/* NOTE(review): the result of SSL_load_client_CA_file is
		 * passed on without a check for failure. */
		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(
			verifypem));
		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER|
			SSL_VERIFY_CLIENT_ONCE|
			SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL);
	}

	SSL_CTX_set_max_early_data(ctx, 0xffffffff);
#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
	if(ngtcp2_crypto_quictls_configure_server_context(ctx) != 0) {
		log_err("ngtcp2_crypto_quictls_configure_server_context failed");
		SSL_CTX_free(ctx);
		return NULL;
	}
#elif defined(MAKE_QUIC_METHOD)
	/* The quic_method needs to remain valid during the SSL_CTX
	 * lifetime, so we allocate it. It is freed with the
	 * doq_server_socket. */
	quic_method = calloc(1, sizeof(SSL_QUIC_METHOD));
	if(!quic_method) {
		log_err("calloc failed: out of memory");
		SSL_CTX_free(ctx);
		return NULL;
	}
	/* NOTE(review): doq_socket is not a parameter or a local variable
	 * of this function; confirm where it is declared for builds that
	 * compile this branch. */
	doq_socket->quic_method = quic_method;
	quic_method->set_encryption_secrets = doq_set_encryption_secrets;
	quic_method->add_handshake_data = doq_add_handshake_data;
	quic_method->flush_flight = doq_flush_flight;
	quic_method->send_alert = doq_send_alert;
	SSL_CTX_set_quic_method(ctx, doq_socket->quic_method);
#endif
	return ctx;
#else /* HAVE_NGTCP2 */
	(void)key; (void)pem; (void)verifypem;
	return NULL;
#endif /* HAVE_NGTCP2 */
}
   4732 
   4733 /** Get the ngtcp2_conn from ssl userdata of type ngtcp2_conn_ref */
   4734 static ngtcp2_conn* doq_conn_ref_get_conn(ngtcp2_crypto_conn_ref* conn_ref)
   4735 {
   4736 	struct doq_conn* conn = (struct doq_conn*)conn_ref->user_data;
   4737 	return conn->conn;
   4738 }
   4739 
/**
 * Create new SSL session for server connection.
 * The SSL is created from ctx, gets app data that refers to conn, is put
 * in accept state and has quic early data enabled.
 * With USE_NGTCP2_CRYPTO_OSSL, the ossl_ctx of the conn is created and
 * the ssl is set on it.
 * Returns the new SSL, or NULL on failure.
 */
static SSL*
doq_ssl_server_setup(SSL_CTX* ctx, struct doq_conn* conn)
{
#ifdef USE_NGTCP2_CRYPTO_OSSL
	int ret;
#endif
	SSL* ssl = SSL_new(ctx);
	if(!ssl) {
		log_crypto_err("doq: SSL_new failed");
		return NULL;
	}
#ifdef USE_NGTCP2_CRYPTO_OSSL
	if((ret=ngtcp2_crypto_ossl_ctx_new(&conn->ossl_ctx, NULL)) != 0) {
		log_err("doq: ngtcp2_crypto_ossl_ctx_new failed: %s",
			ngtcp2_strerror(ret));
		SSL_free(ssl);
		return NULL;
	}
	ngtcp2_crypto_ossl_ctx_set_ssl(conn->ossl_ctx, ssl);
	if(ngtcp2_crypto_ossl_configure_server_session(ssl) != 0) {
		log_err("doq: ngtcp2_crypto_ossl_configure_server_session failed");
		/* NOTE(review): conn->ossl_ctx is not released on this
		 * path, and the ssl that was set on it is freed; presumably
		 * the caller cleans up the conn - confirm. */
		SSL_free(ssl);
		return NULL;
	}
#endif
#if defined(USE_NGTCP2_CRYPTO_OSSL) || defined(HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT)
	/* the app data is the conn_ref, which leads back to the conn via
	 * its user_data and get_conn function */
	conn->conn_ref.get_conn = &doq_conn_ref_get_conn;
	conn->conn_ref.user_data = conn;
	SSL_set_app_data(ssl, &conn->conn_ref);
#else
	/* the app data is the doq_conn itself */
	SSL_set_app_data(ssl, conn);
#endif
	SSL_set_accept_state(ssl);
#ifdef USE_NGTCP2_CRYPTO_OSSL
	SSL_set_quic_tls_early_data_enabled(ssl, 1);
#else
	SSL_set_quic_early_data_enabled(ssl, 1);
#endif
	return ssl;
}
   4781 
/*
 * Set up the ngtcp2 server connection and the SSL session for conn.
 * scid, scidlen: connection id that is passed, together with a newly
 *	generated server connection id, to ngtcp2_conn_server_new.
 * ocid, ocidlen: if ocid is not NULL, it is set as the original dcid in
 *	the transport parameters and the dcid from the conn key is set as
 *	the retry scid. Otherwise the dcid from the conn key is the
 *	original dcid.
 * token, tokenlen: stored as the token in the ngtcp2 settings.
 * Returns 1 on success, 0 on failure.
 */
int
doq_conn_setup(struct doq_conn* conn, uint8_t* scid, size_t scidlen,
	uint8_t* ocid, size_t ocidlen, const uint8_t* token, size_t tokenlen)
{
	int rv;
	struct ngtcp2_cid dcid, sv_scid, scid_cid;
	struct ngtcp2_path path;
	struct ngtcp2_callbacks callbacks;
	struct ngtcp2_settings settings;
	struct ngtcp2_transport_params params;
	memset(&dcid, 0, sizeof(dcid));
	memset(&sv_scid, 0, sizeof(sv_scid));
	memset(&scid_cid, 0, sizeof(scid_cid));
	memset(&path, 0, sizeof(path));
	memset(&callbacks, 0, sizeof(callbacks));
	memset(&settings, 0, sizeof(settings));
	memset(&params, 0, sizeof(params));

	ngtcp2_cid_init(&scid_cid, scid, scidlen);
	ngtcp2_cid_init(&dcid, conn->key.dcid, conn->key.dcidlen);

	/* the path consists of the remote and local address from the
	 * key of the connection */
	path.remote.addr = (struct sockaddr*)&conn->key.paddr.addr;
	path.remote.addrlen = conn->key.paddr.addrlen;
	path.local.addr = (struct sockaddr*)&conn->key.paddr.localaddr;
	path.local.addrlen = conn->key.paddr.localaddrlen;

	/* crypto callbacks are the ngtcp2_crypto routines, the others
	 * are the doq callback functions from this file */
	callbacks.recv_client_initial = ngtcp2_crypto_recv_client_initial_cb;
	callbacks.recv_crypto_data = ngtcp2_crypto_recv_crypto_data_cb;
	callbacks.encrypt = ngtcp2_crypto_encrypt_cb;
	callbacks.decrypt = ngtcp2_crypto_decrypt_cb;
	callbacks.hp_mask = ngtcp2_crypto_hp_mask;
	callbacks.update_key = ngtcp2_crypto_update_key_cb;
	callbacks.delete_crypto_aead_ctx =
		ngtcp2_crypto_delete_crypto_aead_ctx_cb;
	callbacks.delete_crypto_cipher_ctx =
		ngtcp2_crypto_delete_crypto_cipher_ctx_cb;
	callbacks.get_path_challenge_data =
		ngtcp2_crypto_get_path_challenge_data_cb;
	callbacks.version_negotiation = ngtcp2_crypto_version_negotiation_cb;
	callbacks.rand = doq_rand_cb;
	callbacks.get_new_connection_id = doq_get_new_connection_id_cb;
	callbacks.remove_connection_id = doq_remove_connection_id_cb;
	callbacks.handshake_completed = doq_handshake_completed_cb;
	callbacks.stream_open = doq_stream_open_cb;
	callbacks.stream_close = doq_stream_close_cb;
	callbacks.stream_reset = doq_stream_reset_cb;
	callbacks.acked_stream_data_offset = doq_acked_stream_data_offset_cb;
	callbacks.recv_stream_data = doq_recv_stream_data_cb;

	ngtcp2_settings_default(&settings);
	/* only install the log callback when its output is printed */
	if(verbosity >= VERB_ALGO) {
		settings.log_printf = doq_log_printf_cb;
	}
	settings.rand_ctx.native_handle = conn->doq_socket->rnd;
	settings.initial_ts = doq_get_timestamp_nanosec();
	settings.max_stream_window = 6*1024*1024;
	settings.max_window = 6*1024*1024;
	/* the token member has a different layout, depending on what the
	 * configure check detected */
#ifdef HAVE_STRUCT_NGTCP2_SETTINGS_TOKENLEN
	settings.token = (void*)token;
	settings.tokenlen = tokenlen;
#else
	settings.token.base = (void*)token;
	settings.token.len = tokenlen;
#endif

	ngtcp2_transport_params_default(&params);
	params.max_idle_timeout = conn->doq_socket->idle_timeout;
	params.active_connection_id_limit = 7;
	params.initial_max_stream_data_bidi_local = 256*1024;
	params.initial_max_stream_data_bidi_remote = 256*1024;
	params.initial_max_data = 1024*1024;
	/* DoQ uses bidi streams, so we allow 0 uni streams. */
	params.initial_max_streams_uni = 0;
	/* Initial max on number of bidi streams the remote end can open.
	 * That is the number of queries it can make, at first. */
	params.initial_max_streams_bidi = 10;
	if(ocid) {
		ngtcp2_cid_init(&params.original_dcid, ocid, ocidlen);
		ngtcp2_cid_init(&params.retry_scid, conn->key.dcid,
			conn->key.dcidlen);
		params.retry_scid_present = 1;
	} else {
		ngtcp2_cid_init(&params.original_dcid, conn->key.dcid,
			conn->key.dcidlen);
	}
#ifdef HAVE_STRUCT_NGTCP2_TRANSPORT_PARAMS_ORIGINAL_DCID_PRESENT
	params.original_dcid_present = 1;
#endif
	doq_fill_rand(conn->doq_socket->rnd, params.stateless_reset_token,
		sizeof(params.stateless_reset_token));
	sv_scid.datalen = conn->doq_socket->sv_scidlen;
	/* The conid lock is held from the generation of the server scid
	 * until the connection ids are set up for the connection. */
	lock_rw_wrlock(&conn->table->conid_lock);
	if(!doq_conn_generate_new_conid(conn, sv_scid.data, sv_scid.datalen)) {
		lock_rw_unlock(&conn->table->conid_lock);
		return 0;
	}

	rv = ngtcp2_conn_server_new(&conn->conn, &scid_cid, &sv_scid, &path,
		conn->version, &callbacks, &settings, &params, NULL, conn);
	if(rv != 0) {
		lock_rw_unlock(&conn->table->conid_lock);
		log_err("ngtcp2_conn_server_new failed: %s",
			ngtcp2_strerror(rv));
		return 0;
	}
	/* NOTE(review): on the failure paths below, conn->conn is not
	 * released here; presumably the caller deletes the conn - confirm. */
	if(!doq_conn_setup_conids(conn)) {
		lock_rw_unlock(&conn->table->conid_lock);
		log_err("doq_conn_setup_conids failed: out of memory");
		return 0;
	}
	lock_rw_unlock(&conn->table->conid_lock);
	conn->ssl = doq_ssl_server_setup((SSL_CTX*)conn->doq_socket->ctx,
		conn);
	if(!conn->ssl) {
		log_err("doq_ssl_server_setup failed");
		return 0;
	}
	/* the tls native handle is the ossl_ctx or the ssl itself,
	 * depending on the crypto library in use */
#ifdef USE_NGTCP2_CRYPTO_OSSL
	ngtcp2_conn_set_tls_native_handle(conn->conn, conn->ossl_ctx);
#else
	ngtcp2_conn_set_tls_native_handle(conn->conn, conn->ssl);
#endif
	doq_conn_write_enable(conn);
	return 1;
}
   4907 
   4908 struct doq_conid*
   4909 doq_conid_find(struct doq_table* table, const uint8_t* data, size_t datalen)
   4910 {
   4911 	struct rbnode_type* node;
   4912 	struct doq_conid key;
   4913 	key.node.key = &key;
   4914 	key.cid = (void*)data;
   4915 	key.cidlen = datalen;
   4916 	log_assert(table != NULL);
   4917 	node = rbtree_search(table->conid_tree, &key);
   4918 	if(node)
   4919 		return (struct doq_conid*)node->key;
   4920 	return NULL;
   4921 }
   4922 
   4923 /** insert conid in the conid list */
   4924 static void
   4925 doq_conid_list_insert(struct doq_conn* conn, struct doq_conid* conid)
   4926 {
   4927 	conid->prev = NULL;
   4928 	conid->next = conn->conid_list;
   4929 	if(conn->conid_list)
   4930 		conn->conid_list->prev = conid;
   4931 	conn->conid_list = conid;
   4932 }
   4933 
   4934 /** remove conid from the conid list */
   4935 static void
   4936 doq_conid_list_remove(struct doq_conn* conn, struct doq_conid* conid)
   4937 {
   4938 	if(conid->prev)
   4939 		conid->prev->next = conid->next;
   4940 	else	conn->conid_list = conid->next;
   4941 	if(conid->next)
   4942 		conid->next->prev = conid->prev;
   4943 }
   4944 
   4945 /** create a doq_conid */
   4946 static struct doq_conid*
   4947 doq_conid_create(uint8_t* data, size_t datalen, struct doq_conn_key* key)
   4948 {
   4949 	struct doq_conid* conid;
   4950 	conid = calloc(1, sizeof(*conid));
   4951 	if(!conid)
   4952 		return NULL;
   4953 	conid->cid = memdup(data, datalen);
   4954 	if(!conid->cid) {
   4955 		free(conid);
   4956 		return NULL;
   4957 	}
   4958 	conid->cidlen = datalen;
   4959 	conid->node.key = conid;
   4960 	conid->key = *key;
   4961 	conid->key.dcid = memdup(key->dcid, key->dcidlen);
   4962 	if(!conid->key.dcid) {
   4963 		free(conid->cid);
   4964 		free(conid);
   4965 		return NULL;
   4966 	}
   4967 	return conid;
   4968 }
   4969 
   4970 void
   4971 doq_conid_delete(struct doq_conid* conid)
   4972 {
   4973 	if(!conid)
   4974 		return;
   4975 	free(conid->key.dcid);
   4976 	free(conid->cid);
   4977 	free(conid);
   4978 }
   4979 
   4980 /** return true if the conid is for the conn. */
   4981 static int
   4982 conid_is_for_conn(struct doq_conn* conn, struct doq_conid* conid)
   4983 {
   4984 	if(conid->key.dcidlen == conn->key.dcidlen &&
   4985 		memcmp(conid->key.dcid, conn->key.dcid, conid->key.dcidlen)==0
   4986 		&& conid->key.paddr.addrlen == conn->key.paddr.addrlen &&
   4987 		memcmp(&conid->key.paddr.addr, &conn->key.paddr.addr,
   4988 			conid->key.paddr.addrlen) == 0 &&
   4989 		conid->key.paddr.localaddrlen == conn->key.paddr.localaddrlen &&
   4990 		memcmp(&conid->key.paddr.localaddr, &conn->key.paddr.localaddr,
   4991 			conid->key.paddr.localaddrlen) == 0 &&
   4992 		conid->key.paddr.ifindex == conn->key.paddr.ifindex)
   4993 		return 1;
   4994 	return 0;
   4995 }
   4996 
   4997 int
   4998 doq_conn_associate_conid(struct doq_conn* conn, uint8_t* data, size_t datalen)
   4999 {
   5000 	struct doq_conid* conid;
   5001 	conid = doq_conid_find(conn->table, data, datalen);
   5002 	if(conid && !conid_is_for_conn(conn, conid)) {
   5003 		verbose(VERB_ALGO, "doq connection id already exists for "
   5004 			"another doq_conn. Ignoring second connection id.");
   5005 		/* Already exists to another conn, ignore it.
   5006 		 * This works, in that the conid is listed in the doq_conn
   5007 		 * conid_list element, and removed from there. So our conid
   5008 		 * tree and list are fine, when created and removed.
   5009 		 * The tree now does not have the lookup element pointing
   5010 		 * to this connection. */
   5011 		return 1;
   5012 	}
   5013 	if(conid)
   5014 		return 1; /* already inserted */
   5015 	conid = doq_conid_create(data, datalen, &conn->key);
   5016 	if(!conid)
   5017 		return 0;
   5018 	doq_conid_list_insert(conn, conid);
   5019 	(void)rbtree_insert(conn->table->conid_tree, &conid->node);
   5020 	return 1;
   5021 }
   5022 
   5023 void
   5024 doq_conn_dissociate_conid(struct doq_conn* conn, const uint8_t* data,
   5025 	size_t datalen)
   5026 {
   5027 	struct doq_conid* conid;
   5028 	conid = doq_conid_find(conn->table, data, datalen);
   5029 	if(conid && !conid_is_for_conn(conn, conid))
   5030 		return;
   5031 	if(conid) {
   5032 		(void)rbtree_delete(conn->table->conid_tree,
   5033 			conid->node.key);
   5034 		doq_conid_list_remove(conn, conid);
   5035 		doq_conid_delete(conid);
   5036 	}
   5037 }
   5038 
   5039 /** associate the scid array and also the dcid.
   5040  * caller must hold the locks on conn and doq_table.conid_lock. */
   5041 static int
   5042 doq_conn_setup_id_array_and_dcid(struct doq_conn* conn,
   5043 	struct ngtcp2_cid* scids, size_t num_scid)
   5044 {
   5045 	size_t i;
   5046 	for(i=0; i<num_scid; i++) {
   5047 		if(!doq_conn_associate_conid(conn, scids[i].data,
   5048 			scids[i].datalen))
   5049 			return 0;
   5050 	}
   5051 	if(!doq_conn_associate_conid(conn, conn->key.dcid, conn->key.dcidlen))
   5052 		return 0;
   5053 	return 1;
   5054 }
   5055 
   5056 int
   5057 doq_conn_setup_conids(struct doq_conn* conn)
   5058 {
   5059 	size_t num_scid =
   5060 #ifndef HAVE_NGTCP2_CONN_GET_NUM_SCID
   5061 		ngtcp2_conn_get_scid(conn->conn, NULL);
   5062 #else
   5063 		ngtcp2_conn_get_num_scid(conn->conn);
   5064 #endif
   5065 	if(num_scid <= 4) {
   5066 		struct ngtcp2_cid ids[4];
   5067 		/* Usually there are not that many scids when just accepted,
   5068 		 * like only 2. */
   5069 		ngtcp2_conn_get_scid(conn->conn, ids);
   5070 		return doq_conn_setup_id_array_and_dcid(conn, ids, num_scid);
   5071 	} else {
   5072 		struct ngtcp2_cid *scids = calloc(num_scid,
   5073 			sizeof(struct ngtcp2_cid));
   5074 		if(!scids)
   5075 			return 0;
   5076 		ngtcp2_conn_get_scid(conn->conn, scids);
   5077 		if(!doq_conn_setup_id_array_and_dcid(conn, scids, num_scid)) {
   5078 			free(scids);
   5079 			return 0;
   5080 		}
   5081 		free(scids);
   5082 	}
   5083 	return 1;
   5084 }
   5085 
   5086 void
   5087 doq_conn_clear_conids(struct doq_conn* conn)
   5088 {
   5089 	struct doq_conid* p, *next;
   5090 	if(!conn)
   5091 		return;
   5092 	p = conn->conid_list;
   5093 	while(p) {
   5094 		next = p->next;
   5095 		(void)rbtree_delete(conn->table->conid_tree, p->node.key);
   5096 		doq_conid_delete(p);
   5097 		p = next;
   5098 	}
   5099 	conn->conid_list = NULL;
   5100 }
   5101 
   5102 ngtcp2_tstamp doq_get_timestamp_nanosec(void)
   5103 {
   5104 #ifdef CLOCK_REALTIME
   5105 	struct timespec tp;
   5106 	memset(&tp, 0, sizeof(tp));
   5107 	/* Get a nanosecond time, that can be compared with the event base. */
   5108 	if(clock_gettime(CLOCK_REALTIME, &tp) == -1) {
   5109 		log_err("clock_gettime failed: %s", strerror(errno));
   5110 	}
   5111 	return ((uint64_t)tp.tv_sec)*((uint64_t)1000000000) +
   5112 		((uint64_t)tp.tv_nsec);
   5113 #else
   5114 	struct timeval tv;
   5115 	if(gettimeofday(&tv, NULL) < 0) {
   5116 		log_err("gettimeofday failed: %s", strerror(errno));
   5117 	}
   5118 	return ((uint64_t)tv.tv_sec)*((uint64_t)1000000000) +
   5119 		((uint64_t)tv.tv_usec)*((uint64_t)1000);
   5120 #endif /* CLOCK_REALTIME */
   5121 }
   5122 
   5123 /** doq start the closing period for the connection. */
   5124 static int
   5125 doq_conn_start_closing_period(struct comm_point* c, struct doq_conn* conn)
   5126 {
   5127 	struct ngtcp2_path_storage ps;
   5128 	struct ngtcp2_pkt_info pi;
   5129 	ngtcp2_ssize ret;
   5130 	if(!conn)
   5131 		return 1;
   5132 	if(
   5133 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD
   5134 		ngtcp2_conn_in_closing_period(conn->conn)
   5135 #else
   5136 		ngtcp2_conn_is_in_closing_period(conn->conn)
   5137 #endif
   5138 		)
   5139 		return 1;
   5140 	if(
   5141 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
   5142 		ngtcp2_conn_in_draining_period(conn->conn)
   5143 #else
   5144 		ngtcp2_conn_is_in_draining_period(conn->conn)
   5145 #endif
   5146 		) {
   5147 		doq_conn_write_disable(conn);
   5148 		return 1;
   5149 	}
   5150 	ngtcp2_path_storage_zero(&ps);
   5151 	sldns_buffer_clear(c->doq_socket->pkt_buf);
   5152 	/* the call to ngtcp2_conn_write_connection_close causes the
   5153 	 * conn to be closed. It is now in the closing period. */
   5154 	ret = ngtcp2_conn_write_connection_close(conn->conn, &ps.path,
   5155 		&pi, sldns_buffer_begin(c->doq_socket->pkt_buf),
   5156 		sldns_buffer_remaining(c->doq_socket->pkt_buf),
   5157 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5158 		&conn->ccerr
   5159 #else
   5160 		&conn->last_error
   5161 #endif
   5162 		, doq_get_timestamp_nanosec());
   5163 	if(ret < 0) {
   5164 		log_err("doq ngtcp2_conn_write_connection_close failed: %s",
   5165 			ngtcp2_strerror(ret));
   5166 		return 0;
   5167 	}
   5168 	if(ret == 0) {
   5169 		return 0;
   5170 	}
   5171 	sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
   5172 	sldns_buffer_flip(c->doq_socket->pkt_buf);
   5173 
   5174 	/* The close packet is allocated, because it may have to be repeated.
   5175 	 * When incoming packets have this connection dcid. */
   5176 	conn->close_pkt = memdup(sldns_buffer_begin(c->doq_socket->pkt_buf),
   5177 		sldns_buffer_limit(c->doq_socket->pkt_buf));
   5178 	if(!conn->close_pkt) {
   5179 		log_err("doq: could not allocate close packet: out of memory");
   5180 		return 0;
   5181 	}
   5182 	conn->close_pkt_len = sldns_buffer_limit(c->doq_socket->pkt_buf);
   5183 	conn->close_ecn = pi.ecn;
   5184 	return 1;
   5185 }
   5186 
   5187 /** doq send the close packet for the connection, perhaps again. */
   5188 int
   5189 doq_conn_send_close(struct comm_point* c, struct doq_conn* conn)
   5190 {
   5191 	if(!conn)
   5192 		return 0;
   5193 	if(!conn->close_pkt)
   5194 		return 0;
   5195 	if(conn->close_pkt_len > sldns_buffer_capacity(c->doq_socket->pkt_buf))
   5196 		return 0;
   5197 	sldns_buffer_clear(c->doq_socket->pkt_buf);
   5198 	sldns_buffer_write(c->doq_socket->pkt_buf, conn->close_pkt, conn->close_pkt_len);
   5199 	sldns_buffer_flip(c->doq_socket->pkt_buf);
   5200 	verbose(VERB_ALGO, "doq send connection close");
   5201 	doq_send_pkt(c, &conn->key.paddr, conn->close_ecn);
   5202 	doq_conn_write_disable(conn);
   5203 	return 1;
   5204 }
   5205 
   5206 /** doq close the connection on error. If it returns a failure, it
   5207  * does not wait to send a close, and the connection can be dropped. */
   5208 static int
   5209 doq_conn_close_error(struct comm_point* c, struct doq_conn* conn)
   5210 {
   5211 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5212 	if(conn->ccerr.type == NGTCP2_CCERR_TYPE_IDLE_CLOSE)
   5213 		return 0;
   5214 #else
   5215 	if(conn->last_error.type ==
   5216 		NGTCP2_CONNECTION_CLOSE_ERROR_CODE_TYPE_TRANSPORT_IDLE_CLOSE)
   5217 		return 0;
   5218 #endif
   5219 	if(!doq_conn_start_closing_period(c, conn))
   5220 		return 0;
   5221 	if(
   5222 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
   5223 		ngtcp2_conn_in_draining_period(conn->conn)
   5224 #else
   5225 		ngtcp2_conn_is_in_draining_period(conn->conn)
   5226 #endif
   5227 		) {
   5228 		doq_conn_write_disable(conn);
   5229 		return 1;
   5230 	}
   5231 	doq_conn_write_enable(conn);
   5232 	if(!doq_conn_send_close(c, conn))
   5233 		return 0;
   5234 	return 1;
   5235 }
   5236 
   5237 int
   5238 doq_conn_recv(struct comm_point* c, struct doq_pkt_addr* paddr,
   5239 	struct doq_conn* conn, struct ngtcp2_pkt_info* pi, int* err_retry,
   5240 	int* err_drop)
   5241 {
   5242 	int ret;
   5243 	ngtcp2_tstamp ts;
   5244 	struct ngtcp2_path path;
   5245 	memset(&path, 0, sizeof(path));
   5246 	path.remote.addr = (struct sockaddr*)&paddr->addr;
   5247 	path.remote.addrlen = paddr->addrlen;
   5248 	path.local.addr = (struct sockaddr*)&paddr->localaddr;
   5249 	path.local.addrlen = paddr->localaddrlen;
   5250 	ts = doq_get_timestamp_nanosec();
   5251 
   5252 	ret = ngtcp2_conn_read_pkt(conn->conn, &path, pi,
   5253 		sldns_buffer_begin(c->doq_socket->pkt_buf),
   5254 		sldns_buffer_limit(c->doq_socket->pkt_buf), ts);
   5255 	if(ret != 0) {
   5256 		if(err_retry)
   5257 			*err_retry = 0;
   5258 		if(err_drop)
   5259 			*err_drop = 0;
   5260 		if(ret == NGTCP2_ERR_DRAINING) {
   5261 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
   5262 				ngtcp2_strerror(ret));
   5263 			doq_conn_write_disable(conn);
   5264 			return 0;
   5265 		} else if(ret == NGTCP2_ERR_DROP_CONN) {
   5266 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
   5267 				ngtcp2_strerror(ret));
   5268 			if(err_drop)
   5269 				*err_drop = 1;
   5270 			return 0;
   5271 		} else if(ret == NGTCP2_ERR_RETRY) {
   5272 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
   5273 				ngtcp2_strerror(ret));
   5274 			if(err_retry)
   5275 				*err_retry = 1;
   5276 			if(err_drop)
   5277 				*err_drop = 1;
   5278 			return 0;
   5279 		} else if(ret == NGTCP2_ERR_CRYPTO) {
   5280 			if(
   5281 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5282 				!conn->ccerr.error_code
   5283 #else
   5284 				!conn->last_error.error_code
   5285 #endif
   5286 				) {
   5287 				/* in picotls the tls alert may need to be
   5288 				 * copied, but this is with openssl. And there
   5289 				 * is conn->tls_alert. */
   5290 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5291 				ngtcp2_ccerr_set_tls_alert(&conn->ccerr,
   5292 					conn->tls_alert, NULL, 0);
   5293 #else
   5294 				ngtcp2_connection_close_error_set_transport_error_tls_alert(
   5295 					&conn->last_error, conn->tls_alert,
   5296 					NULL, 0);
   5297 #endif
   5298 			}
   5299 		} else {
   5300 			if(
   5301 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5302 				!conn->ccerr.error_code
   5303 #else
   5304 				!conn->last_error.error_code
   5305 #endif
   5306 				) {
   5307 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5308 				ngtcp2_ccerr_set_liberr(&conn->ccerr, ret,
   5309 					NULL, 0);
   5310 #else
   5311 				ngtcp2_connection_close_error_set_transport_error_liberr(
   5312 					&conn->last_error, ret, NULL, 0);
   5313 #endif
   5314 			}
   5315 		}
   5316 		log_err("ngtcp2_conn_read_pkt failed: %s",
   5317 			ngtcp2_strerror(ret));
   5318 		if(!doq_conn_close_error(c, conn)) {
   5319 			if(err_drop)
   5320 				*err_drop = 1;
   5321 		}
   5322 		return 0;
   5323 	}
   5324 	doq_conn_write_enable(conn);
   5325 	return 1;
   5326 }
   5327 
/** doq stream write is done; take the stream off the connection's write
 * list. */
static void
doq_stream_write_is_done(struct doq_conn* conn, struct doq_stream* stream)
{
	/* Only the list membership changes here. The buffer is not
	 * deallocated, because it may be needed for resends. */
	doq_stream_off_write_list(conn, stream);
}
   5335 
   5336 int
   5337 doq_conn_write_streams(struct comm_point* c, struct doq_conn* conn,
   5338 	int* err_drop)
   5339 {
   5340 	struct doq_stream* stream = conn->stream_write_first;
   5341 	ngtcp2_path_storage ps;
   5342 	ngtcp2_tstamp ts = doq_get_timestamp_nanosec();
   5343 	size_t num_packets = 0, max_packets = 65535;
   5344 	ngtcp2_path_storage_zero(&ps);
   5345 
   5346 	for(;;) {
   5347 		int64_t stream_id;
   5348 		uint32_t flags = 0;
   5349 		ngtcp2_pkt_info pi;
   5350 		ngtcp2_vec datav[2];
   5351 		size_t datav_count = 0;
   5352 		ngtcp2_ssize ret, ndatalen = 0;
   5353 		int fin;
   5354 
   5355 		if(stream) {
   5356 			/* data to send */
   5357 			verbose(VERB_ALGO, "doq: doq_conn write stream %d",
   5358 				(int)stream->stream_id);
   5359 			stream_id = stream->stream_id;
   5360 			fin = 1;
   5361 			if(stream->nwrite < 2) {
   5362 				datav[0].base = ((uint8_t*)&stream->
   5363 					outlen_wire) + stream->nwrite;
   5364 				datav[0].len = 2 - stream->nwrite;
   5365 				datav[1].base = stream->out;
   5366 				datav[1].len = stream->outlen;
   5367 				datav_count = 2;
   5368 			} else {
   5369 				datav[0].base = stream->out +
   5370 					(stream->nwrite-2);
   5371 				datav[0].len = stream->outlen -
   5372 					(stream->nwrite-2);
   5373 				datav_count = 1;
   5374 			}
   5375 		} else {
   5376 			/* no data to send */
   5377 			verbose(VERB_ALGO, "doq: doq_conn write stream -1");
   5378 			stream_id = -1;
   5379 			fin = 0;
   5380 			datav[0].base = NULL;
   5381 			datav[0].len = 0;
   5382 			datav_count = 1;
   5383 		}
   5384 
   5385 		/* if more streams, set it to write more */
   5386 		if(stream && stream->write_next)
   5387 			flags |= NGTCP2_WRITE_STREAM_FLAG_MORE;
   5388 		if(fin)
   5389 			flags |= NGTCP2_WRITE_STREAM_FLAG_FIN;
   5390 
   5391 		sldns_buffer_clear(c->doq_socket->pkt_buf);
   5392 		ret = ngtcp2_conn_writev_stream(conn->conn, &ps.path, &pi,
   5393 			sldns_buffer_begin(c->doq_socket->pkt_buf),
   5394 			sldns_buffer_remaining(c->doq_socket->pkt_buf),
   5395 			&ndatalen, flags, stream_id, datav, datav_count, ts);
   5396 		if(ret < 0) {
   5397 			if(ret == NGTCP2_ERR_WRITE_MORE) {
   5398 				verbose(VERB_ALGO, "doq: write more, ndatalen %d", (int)ndatalen);
   5399 				if(stream) {
   5400 					if(ndatalen >= 0)
   5401 						stream->nwrite += ndatalen;
   5402 					if(stream->nwrite >= stream->outlen+2)
   5403 						doq_stream_write_is_done(
   5404 							conn, stream);
   5405 					stream = stream->write_next;
   5406 				}
   5407 				continue;
   5408 			} else if(ret == NGTCP2_ERR_STREAM_DATA_BLOCKED) {
   5409 				verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_DATA_BLOCKED");
   5410 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5411 				ngtcp2_ccerr_set_application_error(
   5412 					&conn->ccerr, -1, NULL, 0);
   5413 #else
   5414 				ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0);
   5415 #endif
   5416 				if(err_drop)
   5417 					*err_drop = 0;
   5418 				if(!doq_conn_close_error(c, conn)) {
   5419 					if(err_drop)
   5420 						*err_drop = 1;
   5421 				}
   5422 				return 0;
   5423 			} else if(ret == NGTCP2_ERR_STREAM_SHUT_WR) {
   5424 				verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_SHUT_WR");
   5425 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5426 				ngtcp2_ccerr_set_application_error(
   5427 					&conn->ccerr, -1, NULL, 0);
   5428 #else
   5429 				ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0);
   5430 #endif
   5431 				if(err_drop)
   5432 					*err_drop = 0;
   5433 				if(!doq_conn_close_error(c, conn)) {
   5434 					if(err_drop)
   5435 						*err_drop = 1;
   5436 				}
   5437 				return 0;
   5438 			}
   5439 
   5440 			log_err("doq: ngtcp2_conn_writev_stream failed: %s",
   5441 				ngtcp2_strerror(ret));
   5442 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5443 			ngtcp2_ccerr_set_liberr(&conn->ccerr, ret, NULL, 0);
   5444 #else
   5445 			ngtcp2_connection_close_error_set_transport_error_liberr(
   5446 				&conn->last_error, ret, NULL, 0);
   5447 #endif
   5448 			if(err_drop)
   5449 				*err_drop = 0;
   5450 			if(!doq_conn_close_error(c, conn)) {
   5451 				if(err_drop)
   5452 					*err_drop = 1;
   5453 			}
   5454 			return 0;
   5455 		}
   5456 		verbose(VERB_ALGO, "doq: writev_stream pkt size %d ndatawritten %d",
   5457 			(int)ret, (int)ndatalen);
   5458 
   5459 		if(ndatalen >= 0 && stream) {
   5460 			stream->nwrite += ndatalen;
   5461 			if(stream->nwrite >= stream->outlen+2)
   5462 				doq_stream_write_is_done(conn, stream);
   5463 		}
   5464 		if(ret == 0) {
   5465 			/* congestion limited */
   5466 			doq_conn_write_disable(conn);
   5467 			ngtcp2_conn_update_pkt_tx_time(conn->conn, ts);
   5468 			return 1;
   5469 		}
   5470 		sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
   5471 		sldns_buffer_flip(c->doq_socket->pkt_buf);
   5472 		doq_send_pkt(c, &conn->key.paddr, pi.ecn);
   5473 
   5474 		if(c->doq_socket->have_blocked_pkt)
   5475 			break;
   5476 		if(++num_packets == max_packets)
   5477 			break;
   5478 		if(stream)
   5479 			stream = stream->write_next;
   5480 	}
   5481 	ngtcp2_conn_update_pkt_tx_time(conn->conn, ts);
   5482 	return 1;
   5483 }
   5484 
   5485 void
   5486 doq_conn_write_enable(struct doq_conn* conn)
   5487 {
   5488 	conn->write_interest = 1;
   5489 }
   5490 
   5491 void
   5492 doq_conn_write_disable(struct doq_conn* conn)
   5493 {
   5494 	conn->write_interest = 0;
   5495 }
   5496 
   5497 /** doq append the connection to the write list */
   5498 static void
   5499 doq_conn_write_list_append(struct doq_table* table, struct doq_conn* conn)
   5500 {
   5501 	if(conn->on_write_list)
   5502 		return;
   5503 	conn->write_prev = table->write_list_last;
   5504 	if(table->write_list_last)
   5505 		table->write_list_last->write_next = conn;
   5506 	else table->write_list_first = conn;
   5507 	conn->write_next = NULL;
   5508 	table->write_list_last = conn;
   5509 	conn->on_write_list = 1;
   5510 }
   5511 
   5512 void
   5513 doq_conn_write_list_remove(struct doq_table* table, struct doq_conn* conn)
   5514 {
   5515 	if(!conn->on_write_list)
   5516 		return;
   5517 	if(conn->write_next)
   5518 		conn->write_next->write_prev = conn->write_prev;
   5519 	else table->write_list_last = conn->write_prev;
   5520 	if(conn->write_prev)
   5521 		conn->write_prev->write_next = conn->write_next;
   5522 	else table->write_list_first = conn->write_next;
   5523 	conn->write_prev = NULL;
   5524 	conn->write_next = NULL;
   5525 	conn->on_write_list = 0;
   5526 }
   5527 
   5528 void
   5529 doq_conn_set_write_list(struct doq_table* table, struct doq_conn* conn)
   5530 {
   5531 	if(conn->write_interest && conn->on_write_list)
   5532 		return;
   5533 	if(!conn->write_interest && !conn->on_write_list)
   5534 		return;
   5535 	if(conn->write_interest)
   5536 		doq_conn_write_list_append(table, conn);
   5537 	else doq_conn_write_list_remove(table, conn);
   5538 }
   5539 
   5540 struct doq_conn*
   5541 doq_table_pop_first(struct doq_table* table)
   5542 {
   5543 	struct doq_conn* conn = table->write_list_first;
   5544 	if(!conn)
   5545 		return NULL;
   5546 	lock_basic_lock(&conn->lock);
   5547 	table->write_list_first = conn->write_next;
   5548 	if(conn->write_next)
   5549 		conn->write_next->write_prev = NULL;
   5550 	else table->write_list_last = NULL;
   5551 	conn->write_next = NULL;
   5552 	conn->write_prev = NULL;
   5553 	conn->on_write_list = 0;
   5554 	return conn;
   5555 }
   5556 
   5557 int
   5558 doq_conn_check_timer(struct doq_conn* conn, struct timeval* tv)
   5559 {
   5560 	ngtcp2_tstamp expiry = ngtcp2_conn_get_expiry(conn->conn);
   5561 	ngtcp2_tstamp now = doq_get_timestamp_nanosec();
   5562 	ngtcp2_tstamp t;
   5563 
   5564 	if(expiry <= now) {
   5565 		/* The timer has already expired, add with zero timeout.
   5566 		 * This should call the callback straight away. Calling it
   5567 		 * from the event callbacks is cleaner than calling it here,
   5568 		 * because then it is always called with the same locks and
   5569 		 * so on. This routine only has the conn.lock. */
   5570 		t = now;
   5571 	} else {
   5572 		t = expiry;
   5573 	}
   5574 
   5575 	/* convert to timeval */
   5576 	memset(tv, 0, sizeof(*tv));
   5577 	tv->tv_sec = t / NGTCP2_SECONDS;
   5578 	tv->tv_usec = (t / NGTCP2_MICROSECONDS)%1000000;
   5579 
   5580 	/* If we already have a timer, is it the right value? */
   5581 	if(conn->timer.timer_in_tree || conn->timer.timer_in_list) {
   5582 		if(conn->timer.time.tv_sec == tv->tv_sec &&
   5583 			conn->timer.time.tv_usec == tv->tv_usec)
   5584 			return 0;
   5585 	}
   5586 	return 1;
   5587 }
   5588 
   5589 /* doq print connection log */
   5590 static void
   5591 doq_conn_log_line(struct doq_conn* conn, char* s)
   5592 {
   5593 	char remotestr[256], localstr[256];
   5594 	addr_to_str((void*)&conn->key.paddr.addr, conn->key.paddr.addrlen,
   5595 		remotestr, sizeof(remotestr));
   5596 	addr_to_str((void*)&conn->key.paddr.localaddr,
   5597 		conn->key.paddr.localaddrlen, localstr, sizeof(localstr));
   5598 	log_info("doq conn %s %s %s", remotestr, localstr, s);
   5599 }
   5600 
   5601 int
   5602 doq_conn_handle_timeout(struct doq_conn* conn)
   5603 {
   5604 	ngtcp2_tstamp now = doq_get_timestamp_nanosec();
   5605 	int rv;
   5606 
   5607 	if(verbosity >= VERB_ALGO)
   5608 		doq_conn_log_line(conn, "timeout");
   5609 
   5610 	rv = ngtcp2_conn_handle_expiry(conn->conn, now);
   5611 	if(rv != 0) {
   5612 		verbose(VERB_ALGO, "ngtcp2_conn_handle_expiry failed: %s",
   5613 			ngtcp2_strerror(rv));
   5614 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
   5615 		ngtcp2_ccerr_set_liberr(&conn->ccerr, rv, NULL, 0);
   5616 #else
   5617 		ngtcp2_connection_close_error_set_transport_error_liberr(
   5618 			&conn->last_error, rv, NULL, 0);
   5619 #endif
   5620 		if(!doq_conn_close_error(conn->doq_socket->cp, conn)) {
   5621 			/* failed, return for deletion */
   5622 			return 0;
   5623 		}
   5624 		return 1;
   5625 	}
   5626 	doq_conn_write_enable(conn);
   5627 	if(!doq_conn_write_streams(conn->doq_socket->cp, conn, NULL)) {
   5628 		/* failed, return for deletion. */
   5629 		return 0;
   5630 	}
   5631 	return 1;
   5632 }
   5633 
   5634 void
   5635 doq_table_quic_size_add(struct doq_table* table, size_t add)
   5636 {
   5637 	lock_basic_lock(&table->size_lock);
   5638 	table->current_size += add;
   5639 	lock_basic_unlock(&table->size_lock);
   5640 }
   5641 
   5642 void
   5643 doq_table_quic_size_subtract(struct doq_table* table, size_t subtract)
   5644 {
   5645 	lock_basic_lock(&table->size_lock);
   5646 	if(table->current_size < subtract)
   5647 		table->current_size = 0;
   5648 	else	table->current_size -= subtract;
   5649 	lock_basic_unlock(&table->size_lock);
   5650 }
   5651 
   5652 int
   5653 doq_table_quic_size_available(struct doq_table* table,
   5654 	struct config_file* cfg, size_t mem)
   5655 {
   5656 	size_t cur;
   5657 	if (!table)
   5658 		return 0;
   5659 	lock_basic_lock(&table->size_lock);
   5660 	cur = table->current_size;
   5661 	lock_basic_unlock(&table->size_lock);
   5662 
   5663 	if(cur + mem > cfg->quic_size)
   5664 		return 0;
   5665 	return 1;
   5666 }
   5667 
   5668 size_t doq_table_quic_size_get(struct doq_table* table)
   5669 {
   5670 	size_t sz;
   5671 	if(!table)
   5672 		return 0;
   5673 	lock_basic_lock(&table->size_lock);
   5674 	sz = table->current_size;
   5675 	lock_basic_unlock(&table->size_lock);
   5676 	return sz;
   5677 }
   5678 #endif /* HAVE_NGTCP2 */
   5679