/* Home | History | Annotate | Line # | Download | only in dist */
      1 /*
      2  * server.c -- nsd(8) network input/output
      3  *
      4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
      5  *
      6  * See LICENSE for the license.
      7  *
      8  */
      9 
     10 #include "config.h"
     11 
     12 #include <sys/types.h>
     13 #include <sys/param.h>
     14 #include <limits.h>
     15 #include <sys/socket.h>
     16 #include <sys/uio.h>
     17 #include <sys/wait.h>
     18 
     19 #include <netinet/in.h>
     20 #ifdef USE_TCP_FASTOPEN
     21   #include <netinet/tcp.h>
     22 #endif
     23 #include <arpa/inet.h>
     24 
     25 #include <assert.h>
     26 #include <ctype.h>
     27 #include <errno.h>
     28 #include <fcntl.h>
     29 #include <stddef.h>
     30 #include <stdio.h>
     31 #include <stdlib.h>
     32 #include <string.h>
     33 #include <time.h>
     34 #include <unistd.h>
     35 #include <signal.h>
     36 #include <netdb.h>
     37 #include <poll.h>
     38 #ifdef HAVE_SYS_RANDOM_H
     39 #include <sys/random.h>
     40 #endif
     41 #ifndef SHUT_WR
     42 #define SHUT_WR 1
     43 #endif
     44 #ifdef HAVE_MMAP
     45 #include <sys/mman.h>
     46 #endif /* HAVE_MMAP */
     47 #ifdef HAVE_OPENSSL_RAND_H
     48 #include <openssl/rand.h>
     49 #endif
     50 #ifdef HAVE_OPENSSL_SSL_H
     51 #include <openssl/ssl.h>
     52 #endif
     53 #ifdef HAVE_OPENSSL_ERR_H
     54 #include <openssl/err.h>
     55 #endif
     56 #ifdef HAVE_OPENSSL_OCSP_H
     57 #include <openssl/ocsp.h>
     58 #endif
     59 #ifndef USE_MINI_EVENT
     60 #  ifdef HAVE_EVENT_H
     61 #    include <event.h>
     62 #  else
     63 #    include <event2/event.h>
     64 #    include "event2/event_struct.h"
     65 #    include "event2/event_compat.h"
     66 #  endif
     67 #else
     68 #  include "mini_event.h"
     69 #endif
     70 
     71 #include "axfr.h"
     72 #include "namedb.h"
     73 #include "netio.h"
     74 #include "xfrd.h"
     75 #include "xfrd-tcp.h"
     76 #include "xfrd-disk.h"
     77 #include "difffile.h"
     78 #include "nsec3.h"
     79 #include "ipc.h"
     80 #include "udb.h"
     81 #include "remote.h"
     82 #include "lookup3.h"
     83 #include "rrl.h"
     84 #include "ixfr.h"
     85 #ifdef USE_DNSTAP
     86 #include "dnstap/dnstap_collector.h"
     87 #endif
     88 #include "verify.h"
     89 #include "util/proxy_protocol.h"
     90 #ifdef USE_XDP
     91 #include "xdp-server.h"
     92 #endif
     93 #ifdef USE_METRICS
     94 #include "metrics.h"
     95 #endif /* USE_METRICS */
     96 
     97 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
     98 
#ifdef USE_DNSTAP
/*
 * log_addr() - print the content of a sockaddr_in/sockaddr_in6 structure,
 * just like it is done in Unbound via the same
 * log_addr(VERB_LEVEL, const char*, sockaddr_storage*).
 * Only logs at verbosity level 6 or higher.
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	/* this is debug-level output only */
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		/* IPv4 address and port */
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		/* non-IPv4 is treated as IPv6; without INET6 such an
		 * address is silently not logged */
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */
    135 
    136 #ifdef USE_TCP_FASTOPEN
    137   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
    138   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
    139 #endif
    140 
/* header state for the PROXYv2 header (for TCP) */
enum pp2_header_state {
	/* no header encountered yet */
	pp2_header_none = 0,
	/* read the static part of the header */
	pp2_header_init,
	/* read the full header */
	pp2_header_done
};
    150 
/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	/* the global nsd state */
	struct nsd        *nsd;
	/* the UDP socket this handler receives on */
	struct nsd_socket *socket;
	/* the event registered for readiness on this socket */
	struct event       event;
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};
    162 
/* Data for the TCP accept handlers (one per listening TCP socket). */
struct tcp_accept_handler_data {
	/* the global nsd state */
	struct nsd        *nsd;
	/* the listening TCP socket */
	struct nsd_socket *socket;
	/* presumably nonzero while the accept event is registered with
	 * the event loop — confirm in the accept enable/disable code */
	int                event_added;
	/* the accept event for the listening socket */
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
	int                tls_auth_accept;
#endif
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};
    176 
#ifdef USE_XDP
/* Data for the XDP (eXpress Data Path) handlers. */
struct xdp_handler_data {
	/* the global nsd state */
	struct nsd        *nsd;
	/* the XDP server state this handler services */
	struct xdp_server *server;
	/* the event registered for the XDP socket */
	struct event event;
};
#endif
    184 
/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connection drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

/* NOTE(review): presumably used to back off from accept() for a short
 * while after a transient failure — confirm in handle_tcp_accept */
static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
/* presumably the OCSP response bytes used for TLS stapling, loaded
 * elsewhere — TODO confirm where ocspdata is filled in */
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
/* fallback definition of struct mmsghdr for platforms that lack it */
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

/* per-slot scratch state for batched UDP message receive/send */
static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
#ifdef USE_XDP
static struct query *xdp_queries[XDP_RX_BATCH_SIZE];
#endif
    222 
/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/* If the query is restarted and needs a reset */
	int query_needs_reset;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the socket of the accept socket to find proper service (local) address the socket is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;

	/* header state for the PROXYv2 header (for TCP) */
	enum pp2_header_state pp2_header_state;

#ifdef HAVE_SSL
	/*
	 * TLS objects.
	 */
	SSL* tls;
	SSL* tls_auth;

	/*
	 * TLS handshake state.  The *_event states indicate that the
	 * event loop is waiting for the other direction of the socket
	 * to become ready before the handshake can continue.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;
    322 
/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

#ifdef USE_XDP
/* Handle incoming queries on an XDP socket. */
static void handle_xdp(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set childrens flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

/* dname compression offset table; presumably indexed by domain number
 * during answer encoding — confirm against query.c usage */
static uint16_t *compressed_dname_offsets = 0;
/* allocated capacity of the offsets table, in entries */
static uint32_t compression_table_capacity = 0;
/* number of entries currently sized for the domain table */
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];
    401 
#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode.  Reads the sysctl file and warns
   when the server bit is not enabled.  Errors are logged; nothing fatal. */
static void report_tcp_fastopen_config() {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		/* no descriptor to read from or close; nothing more to do */
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		/* value is unknown; do not test an uninitialized byte */
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	/* single close on the success paths; the early returns above
	 * ensure the descriptor is never closed twice or when invalid */
	close(tcp_fastopen_fp);
}
#endif
    429 
    430 /*
    431  * Remove the specified pid from the list of child pids.  Returns -1 if
    432  * the pid is not in the list, child_num otherwise.  The field is set to 0.
    433  */
    434 static int
    435 delete_child_pid(struct nsd *nsd, pid_t pid)
    436 {
    437 	size_t i;
    438 	for (i = 0; i < nsd->child_count; ++i) {
    439 		if (nsd->children[i].pid == pid) {
    440 			nsd->children[i].pid = 0;
    441 			if(!nsd->children[i].need_to_exit) {
    442 				if(nsd->children[i].child_fd != -1)
    443 					close(nsd->children[i].child_fd);
    444 				nsd->children[i].child_fd = -1;
    445 				if(nsd->children[i].handler)
    446 					nsd->children[i].handler->fd = -1;
    447 			}
    448 			return i;
    449 		}
    450 	}
    451 	return -1;
    452 }
    453 
/*
 * Restart child servers if necessary.  For every child slot without a
 * live pid, create a fresh IPC socketpair and fork a new server child.
 * Returns 0 on success, -1 on socketpair or fork failure.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			/* fresh IPC channel between server main and child */
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				/* keep only the main-side end of the pair */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					/* first start of this slot: create the
					 * netio handler that reads commands
					 * arriving from the child */
					struct main_ipc_handler_data *ipc_data;
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif
				/* become a server child: reset process-wide
				 * state inherited from server main */
				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				/* close the descriptor ends that belong to
				 * xfrd and to server main */
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}
    539 
#ifdef BIND8_STATS
/* Arm the statistics alarm so it fires on the next whole st_period
 * boundary (wall-clock aligned). */
static void set_bind8_alarm(struct nsd* nsd)
{
	/* a zero period would divide by zero; also nothing to schedule */
	if(nsd->st_period <= 0)
		return;
	alarm(nsd->st_period - (time(NULL) % nsd->st_period));
}
#endif
    548 
    549 /* set zone stat ids for zones initially read in */
    550 static void
    551 zonestatid_tree_set(struct nsd* nsd)
    552 {
    553 	struct radnode* n;
    554 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
    555 		zone_type* zone = (zone_type*)n->elem;
    556 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
    557 	}
    558 }
    559 
    560 #ifdef USE_ZONE_STATS
/* Allocate the two file-backed zone statistics arrays (one for the
 * current children, one for the next generation after reload) in the
 * xfr temp directory and mmap them shared.  Exits the process when a
 * file cannot be created, extended or mapped. */
void
server_zonestat_alloc(struct nsd* nsd)
{
	/* at least one entry even when no zonestat names are configured */
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	/* extend each file to the mapped size by seeking to the last
	 * byte and writing one zero byte */
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	/* map both arrays shared so children and main see the counters */
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}
    642 
/* Remap the zonestat array at index idx to the new byte size sz.
 * Uses mremap where available; otherwise syncs, unmaps and maps the
 * backing file again.  Exits the process when the mapping fails. */
void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	/* MREMAP_MAYMOVE lets the kernel relocate the mapping if it
	 * cannot be grown in place */
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !HAVE MREMAP */
	/* fallback: flush and drop the old mapping, then map anew;
	 * msync/munmap failures are logged but not fatal */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP */
#endif /* HAVE_MMAP */
}
    671 
/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	/* nothing to do when the inactive array already has the size */
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	/* grow the backing file to the new size before remapping */
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}
    706 
    707 /* switchover to use the other array for the new children, that
    708  * briefly coexist with the old children.  And we want to avoid them
    709  * both writing to the same statistics arrays. */
    710 void
    711 server_zonestat_switch(struct nsd* nsd)
    712 {
    713 	if(nsd->zonestatnow == nsd->zonestat[0]) {
    714 		nsd->zonestatnow = nsd->zonestat[1];
    715 		nsd->zonestatsizenow = nsd->zonestatsize[1];
    716 	} else {
    717 		nsd->zonestatnow = nsd->zonestat[0];
    718 		nsd->zonestatsizenow = nsd->zonestatsize[0];
    719 	}
    720 }
    721 #endif /* USE_ZONE_STATS */
    722 
#ifdef BIND8_STATS
/* Allocate the file-backed BIND8 statistics array, with two slots per
 * child (double-buffered per child_count), in the xfr temp directory,
 * and mmap it shared.  Exits the process on failure, removing the
 * stat and zonestat temp files. */
void
server_stat_alloc(struct nsd* nsd)
{
	char tmpfile[256];
	/* two stat blocks per child: current and previous generation */
	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
	uint8_t z = 0;

	/* file name */
	nsd->statfname = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->statfname = region_strdup(nsd->region, tmpfile);

	/* file descriptor */
	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
	if(nsd->statfd == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
			strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}

#ifdef HAVE_MMAP
	/* extend the file to the mapped size by writing the last byte */
	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
			strerror(errno));
		goto fail_exit;
	}
	if(write(nsd->statfd, &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->statfname, strerror(errno));
		goto fail_exit;
	}
	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->statfd, 0);
	if(nsd->stat_map == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
fail_exit:
		/* shared cleanup for all mmap-path failures */
		close(nsd->statfd);
		unlink(nsd->statfname);
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->stat_map, 0, sz);
	nsd->stats_per_child[0] = nsd->stat_map;
	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
	nsd->stat_current = 0;
	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
#endif /* HAVE_MMAP */
}
#endif /* BIND8_STATS */
    777 
#ifdef BIND8_STATS
/* Remove the on-disk statistics file for this nsd instance. */
void
server_stat_free(struct nsd* nsd)
{
	const char* statfile = nsd->statfname;
	unlink(statfile);
}
#endif /* BIND8_STATS */
    785 
    786 static void
    787 cleanup_dname_compression_tables(void *ptr)
    788 {
    789 	free(ptr);
    790 	compressed_dname_offsets = NULL;
    791 	compression_table_capacity = 0;
    792 }
    793 
/* (Re)allocate the dname compression offsets table so it can hold an
 * entry for every domain in the database plus the extra reserved
 * numbers, and clear it.  Registers a region cleanup so the table is
 * freed when the database region is destroyed. */
static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			/* detach the old cleanup before freeing so the
			 * region does not free the stale pointer later */
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}
    816 
    817 static int
    818 set_cloexec(struct nsd_socket *sock)
    819 {
    820 	assert(sock != NULL);
    821 
    822 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
    823 		const char *socktype =
    824 			sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp";
    825 		log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s",
    826 			socktype, strerror(errno));
    827 		return -1;
    828 	}
    829 
    830 	return 1;
    831 }
    832 
/* Enable port reuse on the socket.  Returns 1 on success, -1 on
 * failure (logged unless it is a quiet ENOPROTOOPT), and 0 when the
 * platform has no SO_REUSEPORT at all. */
static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int enable = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what the users want with the config
	 * option in nsd.conf; if we actually need local address and port reuse
	 * they'll also need to have SO_REUSEPORT set for them, assume it was
	 * _LB they want.
	 */
	int option = SO_REUSEPORT_LB;
	static const char option_name[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int option = SO_REUSEPORT;
	static const char option_name[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if(setsockopt(sock->s, SOL_SOCKET, option, &enable, sizeof(enable)) != 0) {
		/* ENOPROTOOPT is only reported at higher verbosity */
		if(verbosity >= 3 || errno != ENOPROTOOPT)
			log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
				option_name, strerror(errno));
		return -1;
	}
	return 1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}
    865 
    866 static int
    867 set_reuseaddr(struct nsd_socket *sock)
    868 {
    869 #ifdef SO_REUSEADDR
    870 	int on = 1;
    871 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
    872 		return 1;
    873 	}
    874 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
    875 		strerror(errno));
    876 	return -1;
    877 #endif /* SO_REUSEADDR */
    878 	return 0;
    879 }
    880 
/*
 * Request a kernel receive buffer of 'rcv' bytes for the socket.
 *
 * Prefers SO_RCVBUFFORCE where available (Linux; presumably allows a
 * privileged process to exceed the sysctl limit -- TODO confirm),
 * otherwise falls back to plain SO_RCVBUF at compile time.
 * Returns 1 on success, 0 when the request was denied non-fatally
 * (EPERM/ENOSYS, or ENOBUFS which is additionally logged at
 * verbosity 2), and -1 on any other error.  Returns 0 without doing
 * anything if SO_RCVBUF is not defined at all.
 */
static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	/* EPERM/ENOBUFS: request refused but not fatal for startup. */
	if(errno == EPERM || errno == ENOBUFS) {
		if(errno == ENOBUFS) {
			VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_RCVBUFFORCE, %d) was not granted: %s",
				rcv, strerror(errno)));
		}
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, %d) failed: %s",
		rcv, strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	/* ENOSYS/ENOBUFS: treated as non-fatal, mirrors the branch above. */
	if(errno == ENOSYS || errno == ENOBUFS) {
		if(errno == ENOBUFS) {
			VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_RCVBUF, %d) was not granted: %s",
				rcv, strerror(errno)));
		}
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, %d) failed: %s",
		rcv, strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}
    922 
/*
 * Request a kernel send buffer of 'snd' bytes for the socket.
 *
 * Mirror image of set_rcvbuf(): tries SO_SNDBUFFORCE first where
 * available, falling back to SO_SNDBUF at compile time.  Returns 1 on
 * success, 0 on a non-fatal refusal (EPERM/ENOSYS, or ENOBUFS which is
 * logged at verbosity 2), -1 on any other error, and 0 untried when
 * SO_SNDBUF is not defined.
 */
static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	/* EPERM/ENOBUFS: request refused but not fatal for startup. */
	if(errno == EPERM || errno == ENOBUFS) {
		if(errno == ENOBUFS) {
			VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_SNDBUFFORCE, %d) was not granted: %s",
				snd, strerror(errno)));
		}
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, %d) failed: %s",
		snd, strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	/* ENOSYS/ENOBUFS: treated as non-fatal, mirrors the branch above. */
	if(errno == ENOSYS || errno == ENOBUFS) {
		if(errno == ENOBUFS) {
			VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_SNDBUF, %d) was not granted: %s",
				snd, strerror(errno)));
		}
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, %d) failed: %s",
		snd, strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}
    964 
    965 static int
    966 set_nonblock(struct nsd_socket *sock)
    967 {
    968 	const char *socktype =
    969 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
    970 
    971 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
    972 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
    973 			socktype, strerror(errno));
    974 		return -1;
    975 	}
    976 
    977 	return 1;
    978 }
    979 
#ifdef INET6
/*
 * Restrict an AF_INET6 socket to IPv6 traffic only (no v4-mapped
 * addresses).  Returns 1 on success, -1 on failure, 0 when
 * IPV6_V6ONLY is unavailable on this platform.
 */
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef IPV6_V6ONLY
	int enable = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(setsockopt(sock->s, IPPROTO_IPV6, IPV6_V6ONLY,
	              &enable, sizeof(enable)) != 0)
	{
		log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}
	return 1;
#else
	(void)sock;
	return 0;
#endif /* IPV6_V6ONLY */
}
#endif /* INET6 */
   1005 
#ifdef INET6
/*
 * Keep IPv6 UDP responses within the minimum IPv6 MTU (1280 octets),
 * since routers never fragment IPv6 in flight.  Uses IPV6_USE_MIN_MTU
 * where available, otherwise pins IPV6_MTU to IPV6_MIN_MTU (Linux).
 * Returns 1 on success, -1 on failure, 0 when neither option exists.
 */
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_USE_MIN_MTU || IPV6_MTU */

	return 0;
}
#endif /* INET6 */
   1044 
/*
 * Disable IPv4 Path MTU Discovery on the socket so that response size
 * is not driven by (spoofable) PMTU state.
 *
 * Tries, in order: IP_MTU_DISCOVER with IP_PMTUDISC_OMIT, then with
 * IP_PMTUDISC_DONT (Linux), or clears IP_DONTFRAG (other platforms).
 * Returns 1 as soon as one option is set, -1 when every applicable
 * attempt failed, and 0 when the platform offers none of the options
 * (e.g. FreeBSD, which reportedly has these semantics by default).
 */
static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters smaller MTU link in network. This mitigates
	 * DNS fragmentation attacks by preventing forged PMTU information.
	 * FreeBSD already has same semantics without setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	/* Both Linux-style attempts failed. */
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	/* Clear the don't-fragment flag so oversized responses may fragment. */
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}
   1101 
   1102 static int
   1103 set_ip_freebind(struct nsd_socket *sock)
   1104 {
   1105 #ifdef IP_FREEBIND
   1106 	int on = 1;
   1107 	const char *socktype =
   1108 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
   1109 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
   1110 	{
   1111 		return 1;
   1112 	}
   1113 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
   1114 		socktype, strerror(errno));
   1115 	return -1;
   1116 #else
   1117 	(void)sock;
   1118 #endif /* IP_FREEBIND */
   1119 
   1120 	return 0;
   1121 }
   1122 
/*
 * Enable "transparent" (non-local) address binding using whichever
 * platform option exists: IP_TRANSPARENT (Linux), SO_BINDANY (OpenBSD)
 * or IP_BINDANY/IPV6_BINDANY (FreeBSD).  Returns 1 on success, -1 on
 * failure, 0 when no such option exists on this platform.
 */
static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platform manage this setting
	differently for different address families (IPv4 vs IPv6).
	This scandalous preprocessor blob below abstracts such variability
	in the way which leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
// as of 2020-01, Linux does not support this on IPv6 programmatically
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
	/* Default the v6 variants to the v4 ones for platforms (Linux,
	 * OpenBSD) that use a single option for both address families. */
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}
   1188 
   1189 static int
   1190 set_tcp_maxseg(struct nsd_socket *sock, int mss)
   1191 {
   1192 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
   1193 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
   1194 		return 1;
   1195 	}
   1196 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
   1197 		strerror(errno));
   1198 	return -1;
   1199 #else
   1200 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
   1201 #endif
   1202 	return 0;
   1203 }
   1204 
#ifdef USE_TCP_FASTOPEN
/*
 * Enable TCP Fast Open (RFC 7413) on the listening socket.
 * Returns 1 on success, 0 when kernel support is compiled out
 * (ENOPROTOOPT, logged only at verbosity >= 3), -1 on other errors.
 */
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC7413.
	 */
	int qlen;
	int err;

#ifdef __APPLE__
	/* macOS X implementation only supports qlen of 1 via this call. The
	 * actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	/* Save errno before logging: log_msg() may clobber it, which would
	 * corrupt the ENOPROTOOPT check in the return below. */
	err = errno;
	if (err == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(err));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(err != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(err));
	}

	return (err == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */
   1248 
/*
 * Bind the socket to the network device named in sock->device
 * (Linux SO_BINDTODEVICE).  Returns 1 on success, -1 on failure,
 * 0 when the option is unavailable on this platform.
 */
static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
	                 "SO_BINDTODEVICE", sock->device, strerror(errno));
	return -1;
#else
	(void)sock;
	return 0;
#endif
}
   1267 
/*
 * Attach the socket to the FreeBSD routing table (FIB) given by
 * sock->fib.  Returns 1 on success, -1 on failure, 0 when SO_SETFIB
 * is unavailable on this platform.
 */
static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
	              (const void *)&sock->fib, sizeof(sock->fib)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
	                 "SO_SETFIB", sock->fib, strerror(errno));
	return -1;
#else
	(void)sock;
	return 0;
#endif
}
   1286 
/*
 * Create, configure and bind one UDP server socket described by 'sock'.
 *
 * Returns 1 on success (sock->s holds the bound descriptor), 0 when the
 * address family is unsupported and the socket is marked optional (IPv6
 * fallback), -1 on fatal errors.  *reuseport_works, when non-NULL and
 * still set, is cleared if SO_REUSEPORT could not be enabled so that
 * subsequent sockets skip the attempt.
 */
static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = nsd->options->receive_buffer_size;
	int snd = nsd->options->send_buffer_size;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		/* An optional IPv6 socket on a v4-only host is not an error. */
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	/* Failure to set close-on-exec is logged but not fatal. */
	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	/* freebind/transparent failures are tolerated (best-effort). */
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}
   1357 
/*
 * Create, configure, bind and listen() on one TCP server socket
 * described by 'sock'.
 *
 * Returns 1 on success, 0 when the address family is unsupported and
 * the socket is optional (IPv6 fallback), -1 on fatal errors.  Same
 * *reuseport_works contract as open_udp_socket().
 */
static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		/* An optional IPv6 socket on a v4-only host is not an error. */
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
			                     "not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	/* Failure to set close-on-exec is logged but not fatal. */
	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	/* Best-effort; TFO failure does not prevent normal TCP service. */
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, nsd->options->tcp_listen_queue) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}
   1432 
/*
 * Initialize the server: create and bind all UDP/TCP sockets, probing
 * whether SO_REUSEPORT works on the first batch.  When it does and
 * reuseport is configured, the interface arrays are grown so that every
 * server instance gets its own set of descriptors.  Returns 0 on
 * success, -1 on failure.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		/* one descriptor set per server instance */
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		/* detach the old pointers from the region before realloc may
		 * move them, then re-register the new pointers */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			/* zero the newly added tail of both arrays */
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		/* clone each original socket's config and open a fresh fd */
		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
			nsd->tcp[i].s = -1;
			if(open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1) {
				return -1;
			}
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	/* open server interface ports for verifiers */
	for(i = 0; i < nsd->verify_ifs; i++) {
		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
		{
			return -1;
		}
	}

	return 0;
}
   1500 
/*
 * Prepare the server for take off: seed the rate-limit hash (when
 * RATELIMIT is compiled in), open the zone database, verify that zone
 * files are readable and initialize compression tables and statistics.
 * Returns 0 on success, -1 when the database cannot be opened (after
 * cleaning up task files and the xfr temp directory).
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	/* weak fallback seed; improved below with OpenSSL when possible */
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
#  ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
#  endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno));
		/* remove the task files created by server_prepare_xfrd() */
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
		server_stat_free(nsd);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files can be read */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st->boot);
	nsd->st->reloadcount = 0;
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}
   1570 
   1571 /*
   1572  * Fork the required number of servers.
   1573  */
   1574 static int
   1575 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
   1576 	int* xfrd_sock_p)
   1577 {
   1578 	size_t i;
   1579 
   1580 	/* Start all child servers initially.  */
   1581 	for (i = 0; i < nsd->child_count; ++i) {
   1582 		nsd->children[i].pid = 0;
   1583 	}
   1584 
   1585 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
   1586 }
   1587 
   1588 static void
   1589 server_close_socket(struct nsd_socket *sock)
   1590 {
   1591 	if(sock->s != -1) {
   1592 		close(sock->s);
   1593 		sock->s = -1;
   1594 	}
   1595 }
   1596 
   1597 void
   1598 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
   1599 {
   1600 	size_t i;
   1601 
   1602 	/* Close all the sockets... */
   1603 	for (i = 0; i < n; ++i) {
   1604 		server_close_socket(&sockets[i]);
   1605 	}
   1606 }
   1607 
/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 *
 * Runs in both roles: a child closes only its command channel to the
 * parent, the parent (this_child == NULL) closes the channels to all
 * children.  With MEMCLEAN defined, heap structures are torn down
 * explicitly (useful for leak checkers); otherwise the OS reclaims
 * the pages at exit.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#ifdef USE_METRICS
	daemon_metrics_delete(nsd->metrics);
#endif /* USE_METRICS */
#ifdef HAVE_SSL
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
	if (nsd->tls_auth_ctx)
		SSL_CTX_free(nsd->tls_auth_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_free_ixfr(nsd->db);
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}
   1665 
/*
 * Prepare IPC structures shared with the xfrd process before it is
 * forked: two task mmap files (double-buffered between main and xfrd),
 * the xfrd listener handler, and one pipe per (2 * child_count) slot
 * used to forward NOTIFY messages from serve processes to xfrd.
 * Exits the process if a task file or pipe cannot be created.
 */
void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	size_t i;
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
		server_stat_free(nsd);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		/* roll back the first task file as well */
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
		server_stat_free(nsd);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
	/* setup sockets to pass NOTIFY messages from the serve processes */
	nsd->serve2xfrd_fd_send = region_alloc_array(
			nsd->region, 2 * nsd->child_count, sizeof(int));
	nsd->serve2xfrd_fd_recv= region_alloc_array(
			nsd->region, 2 * nsd->child_count, sizeof(int));
	for(i=0; i < 2 * nsd->child_count; i++) {
		int pipefd[2];
		pipefd[0] = -1; /* For receiving by parent (xfrd) */
		pipefd[1] = -1; /* For sending   by child  (server childs) */
		if(pipe(pipefd) < 0) {
                        log_msg(LOG_ERR, "fatal error: cannot create NOTIFY "
				"communication channel: %s", strerror(errno));
			exit(1);
                }
                nsd->serve2xfrd_fd_recv[i] = pipefd[0];
                nsd->serve2xfrd_fd_send[i] = pipefd[1];
	}
	/* second half of the send fds doubles as the swap set */
	nsd->serve2xfrd_fd_swap = nsd->serve2xfrd_fd_send + nsd->child_count;
}
   1733 
   1734 
/*
 * Fork off the xfrd (zone transfer daemon) process.
 *
 * Note the role inversion: after fork() the PARENT becomes xfrd
 * (xfrd_init() does not return) while the CHILD continues as the
 * "main" server process.  The two communicate over a non-blocking
 * AF_UNIX socketpair.  del_db requests recreation of the taskdb that
 * xfrd was using (it may be corrupt after an xfrd crash).
 */
void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;
	size_t i;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

		/* close the send site of the serve2xfrd fds */
		assert(nsd->serve2xfrd_fd_send < nsd->serve2xfrd_fd_swap);
		for(i = 0; i < 2 * nsd->child_count; i++) {
			if(nsd->serve2xfrd_fd_send[i] != -1) {
				close(nsd->serve2xfrd_fd_send[i]);
				nsd->serve2xfrd_fd_send[i] = -1;
			}
		}
#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef USE_LOG_PROCESS_ROLE
		log_set_process_role("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		/* parent runs as xfrd from here on; xfrd_init never returns */
		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		/* close the receive site of the serve2xfrd fds */
		for(i = 0; i < 2 * nsd->child_count; i++) {
			if(nsd->serve2xfrd_fd_recv[i] != -1) {
				close(nsd->serve2xfrd_fd_recv[i]);
				nsd->serve2xfrd_fd_recv[i] = -1;
			}
		}
#ifdef HAVE_SETPROCTITLE
		setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
		log_set_process_role("main");
#endif
		break;
	}
	/* server-parent only */
	/* (reached by the child case and by fork failure) */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}
   1831 
   1832 /** add all soainfo to taskdb */
   1833 static void
   1834 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
   1835 {
   1836 	struct radnode* n;
   1837 	udb_ptr task_last; /* last task, mytask is empty so NULL */
   1838 	/* add all SOA INFO to mytask */
   1839 	udb_ptr_init(&task_last, taskudb);
   1840 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
   1841 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
   1842 	}
   1843 	udb_ptr_unlink(&task_last, taskudb);
   1844 }
   1845 
void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		/* a shutdown hint seen before we start the exchange aborts
		 * the whole server: close sockets, remove files and exit */
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
			daemon_remote_close(nsd->rc);
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile, nsd->username);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
			server_stat_free(nsd);
#endif
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	/* send our pid to xfrd; the log message calls it the reloadpid.
	 * nsd->xfrd_listener->fd is the same socket as xfrd_sock above */
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		/* switch to the other taskdb, which xfrd filled with expires */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}
   1936 
   1937 #ifdef HAVE_SSL
/** log the given crypto error at level, then drain and log any further
 * errors remaining in the openssl error queue */
static void
log_crypto_from_err(int level, const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char errbuf[128];
	unsigned long more;
	ERR_error_string_n(err, errbuf, sizeof(errbuf));
	log_msg(level, "%s crypto %s", str, errbuf);
	for(more = ERR_get_error(); more; more = ERR_get_error()) {
		ERR_error_string_n(more, errbuf, sizeof(errbuf));
		log_msg(level, "and additionally crypto %s", errbuf);
	}
}
   1951 
/** log the head of the openssl error queue at LOG_ERR severity,
 * followed by any additional queued errors */
void
log_crypto_err(const char* str)
{
	log_crypto_from_err(LOG_ERR, str, ERR_get_error());
}
   1957 
/** log the head of the openssl error queue at LOG_WARNING severity,
 * followed by any additional queued errors */
void
log_crypto_warning(const char* str)
{
	log_crypto_from_err(LOG_WARNING, str, ERR_get_error());
}
   1963 
   1964 /** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	/* squelch failures caused by the remote peer: speaking HTTP(S) to
	 * the TLS port, offering unsupported TLS versions, or certificate
	 * and cipher mismatches; these would otherwise flood the logs */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
#  ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
#  endif
#endif
		)
		return 1;
	return 0;
}
   1990 
/** initialize the openssl library: load error strings and algorithms,
 * and make sure the PRNG reports being seeded */
void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
#if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
	ERR_load_SSL_strings();
#endif
	/* pre-1.1.0 openssl uses the explicit add-all/init calls; newer
	 * versions use OPENSSL_init_crypto / OPENSSL_init_ssl */
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		/* NOTE(review): this fallback seed (time ^ pid stretched by a
		 * weak multiplicative recurrence) is low-entropy; it only runs
		 * when the system offers no entropy at all, and it warns */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}
   2028 
   2029 static int
   2030 get_ocsp(char *filename, unsigned char **ocsp)
   2031 {
   2032 	BIO *bio;
   2033 	OCSP_RESPONSE *response;
   2034 	int len = -1;
   2035 	unsigned char *p, *buf;
   2036 	assert(filename);
   2037 
   2038 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
   2039 		log_crypto_err("get_ocsp: BIO_new_file failed");
   2040 		return -1;
   2041 	}
   2042 
   2043 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
   2044 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
   2045 		BIO_free(bio);
   2046 		return -1;
   2047 	}
   2048 
   2049 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
   2050 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
   2051 		OCSP_RESPONSE_free(response);
   2052 		BIO_free(bio);
   2053 		return -1;
   2054 	}
   2055 
   2056 	if ((buf = malloc((size_t) len)) == NULL) {
   2057 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
   2058 		OCSP_RESPONSE_free(response);
   2059 		BIO_free(bio);
   2060 		return -1;
   2061 	}
   2062 
   2063 	p = buf;
   2064 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
   2065 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
   2066 		free(buf);
   2067 		OCSP_RESPONSE_free(response);
   2068 		BIO_free(bio);
   2069 		return -1;
   2070 	}
   2071 
   2072 	OCSP_RESPONSE_free(response);
   2073 	BIO_free(bio);
   2074 
   2075 	*ocsp = buf;
   2076 	return len;
   2077 }
   2078 
   2079 /* further setup ssl ctx after the keys are loaded */
/** enable ECDHE key exchange on the context: automatically where the
 * openssl version supports it, otherwise by installing the p256 curve
 * explicitly; failures are logged but non-fatal */
static void
listen_sslctx_setup_2(void* ctxt)
{
	SSL_CTX* ctx = (SSL_CTX*)ctxt;
	(void)ctx;
#if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
		/* ENOTREACH */
		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
	}
#elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
	/* older openssl: select the NIST P-256 curve by hand */
	if(1) {
		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
		if (!ecdh) {
			log_crypto_err("could not find p256, not enabling ECDHE");
		} else {
			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
			}
			EC_KEY_free (ecdh);
		}
	}
#endif
}
   2104 
   2105 static int
   2106 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
   2107 {
   2108 	if(ocspdata) {
   2109 		unsigned char *p;
   2110 		if ((p=malloc(ocspdata_len)) == NULL) {
   2111 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
   2112 			return SSL_TLSEXT_ERR_NOACK;
   2113 		}
   2114 		memcpy(p, ocspdata, ocspdata_len);
   2115 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
   2116 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
   2117 			free(p);
   2118 			return SSL_TLSEXT_ERR_NOACK;
   2119 		}
   2120 		return SSL_TLSEXT_ERR_OK;
   2121 	} else {
   2122 		return SSL_TLSEXT_ERR_NOACK;
   2123 	}
   2124 }
   2125 
   2126 static int
   2127 server_alpn_cb(SSL* ATTR_UNUSED(s),
   2128 		const unsigned char** out, unsigned char* outlen,
   2129 		const unsigned char* in, unsigned int inlen,
   2130 		void* ATTR_UNUSED(arg))
   2131 {
   2132 	static const unsigned char alpns[] = { 3, 'd', 'o', 't' };
   2133 	unsigned char* tmp_out;
   2134 
   2135 	SSL_select_next_proto(&tmp_out, outlen, alpns, sizeof(alpns), in, inlen);
   2136 	*out = tmp_out;
   2137 	return SSL_TLSEXT_ERR_OK;
   2138 }
   2139 
/** Create and configure an SSL_CTX for the server: loads the cert
 * chain (pem) and private key (key), disables old protocol versions
 * and renegotiation, restricts ciphers, and enables mandatory client
 * certificate verification when verifypem is set.  Returns NULL on
 * failure (the partial context is freed first). */
SSL_CTX*
server_tls_ctx_setup(char* key, char* pem, char* verifypem)
{
	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
	if(!ctx) {
		log_crypto_err("could not SSL_CTX_new");
		return NULL;
	}
	/* no SSLv2, SSLv3 because has defects */
#if SSL_OP_NO_SSLv2 != 0
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
		log_crypto_err("could not set SSL_OP_NO_SSLv2");
		SSL_CTX_free(ctx);
		return NULL;
	}
#endif
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
		!= SSL_OP_NO_SSLv3){
		log_crypto_err("could not set SSL_OP_NO_SSLv3");
		SSL_CTX_free(ctx);
		return 0;
	}
#if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
	/* if we have tls 1.1 disable 1.0 */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
		!= SSL_OP_NO_TLSv1){
		log_crypto_err("could not set SSL_OP_NO_TLSv1");
		SSL_CTX_free(ctx);
		return 0;
	}
#endif
#if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
	/* if we have tls 1.2 disable 1.1 */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
		!= SSL_OP_NO_TLSv1_1){
		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
		SSL_CTX_free(ctx);
		return 0;
	}
#endif
#if defined(SSL_OP_NO_TLSv1_2) && defined(SSL_OP_NO_TLSv1_3)
	/* if we have tls 1.3 disable 1.2 */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_2) & SSL_OP_NO_TLSv1_2)
		!= SSL_OP_NO_TLSv1_2){
		log_crypto_err("could not set SSL_OP_NO_TLSv1_2");
		SSL_CTX_free(ctx);
		return 0;
	}
#endif
#if defined(SSL_OP_NO_RENEGOTIATION)
	/* disable client renegotiation */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
		SSL_CTX_free(ctx);
		return 0;
	}
#endif
#if defined(SSL_OP_IGNORE_UNEXPECTED_EOF)
	/* tolerate peers that close without a close_notify alert; this
	 * option is non-fatal, only a warning is logged on failure */
	if((SSL_CTX_set_options(ctx, SSL_OP_IGNORE_UNEXPECTED_EOF) &
		SSL_OP_IGNORE_UNEXPECTED_EOF) != SSL_OP_IGNORE_UNEXPECTED_EOF) {
		log_crypto_warning("could not set SSL_OP_IGNORE_UNEXPECTED_EOF");
	}
#endif
#if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
	/* if we detect system-wide crypto policies, use those */
	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
		/* if we have sha256, set the cipher list to have no known vulns */
		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
	}
#endif
	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
		SSL_OP_CIPHER_SERVER_PREFERENCE) {
		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
		SSL_CTX_free(ctx);
		return 0;
	}
#ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
	SSL_CTX_set_security_level(ctx, 0);
#endif
	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
		log_msg(LOG_ERR, "error for cert file: %s", pem);
		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
		log_msg(LOG_ERR, "error for private key file: %s", key);
		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_check_private_key(ctx)) {
		log_msg(LOG_ERR, "error for key file: %s", key);
		log_crypto_err("Error in SSL_CTX check_private_key");
		SSL_CTX_free(ctx);
		return NULL;
	}
	listen_sslctx_setup_2(ctx);
	if(verifypem && verifypem[0]) {
		/* require and verify client certificates against verifypem */
		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
			log_crypto_err("Error in SSL_CTX verify locations");
			SSL_CTX_free(ctx);
			return NULL;
		}
		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL);
	}
	SSL_CTX_set_alpn_select_cb(ctx, server_alpn_cb, NULL);
	return ctx;
}
   2254 
/** Create the server TLS context from the nsd options (tls-service-key
 * and tls-service-pem); optionally loads an OCSP response from
 * ocspfile for stapling.  Returns NULL on error. */
SSL_CTX*
server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
{
	char *key, *pem;
	SSL_CTX *ctx;

	key = nsd->options->tls_service_key;
	pem = nsd->options->tls_service_pem;
	if(!key || key[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-key file specified");
		return NULL;
	}
	if(!pem || pem[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
		return NULL;
	}

	/* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but
	 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/
	ctx = server_tls_ctx_setup(key, pem, verifypem);
	if(!ctx) {
		log_msg(LOG_ERR, "could not setup server TLS context");
		return NULL;
	}
	if(ocspfile && ocspfile[0]) {
		/* NOTE(review): this < 0 error check assumes ocspdata_len is
		 * declared with a signed type — confirm its declaration */
		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
			log_crypto_err("Error reading OCSPfile");
			SSL_CTX_free(ctx);
			return NULL;
		} else {
			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
				SSL_CTX_free(ctx);
				return NULL;
			}
		}
	}
	return ctx;
}
   2295 
   2296 /* check if tcp_handler_accept_data created for TLS dedicated port */
/**
 * Return 1 when addr carries the configured TLS port (tls_port is the
 * decimal port string), 0 otherwise.
 *
 * Fix: the IPv6 branch was guarded by #ifndef HAVE_STRUCT_SOCKADDR_IN6,
 * i.e. it was compiled exactly when struct sockaddr_in6 is NOT
 * available (failing to build there, and never checking IPv6 ports
 * where it is available).  The guard must be #ifdef.
 */
int
using_tls_port(struct sockaddr* addr, const char* tls_port)
{
	in_port_t port = 0;

	if (addr->sa_family == AF_INET)
		port = ((struct sockaddr_in*)addr)->sin_port;
#ifdef HAVE_STRUCT_SOCKADDR_IN6
	else
		port = ((struct sockaddr_in6*)addr)->sin6_port;
#endif /* HAVE_STRUCT_SOCKADDR_IN6 */
	if (atoi(tls_port) == ntohs(port))
		return 1;

	return 0;
}
   2313 #endif
   2314 
   2315 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
/** Read exactly sz bytes from socket s into p, waiting with poll().
 * Returns sz on success, 0 on closed connection, -1 on error or when a
 * quit/shutdown signal hint is set, -2 on timeout.  timeout is in
 * seconds, -1 blocks forever. */
ssize_t
block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
{
	uint8_t* buf = (uint8_t*) p;
	ssize_t total = 0;
	struct pollfd fd;
	memset(&fd, 0, sizeof(fd));
	fd.fd = s;
	fd.events = POLLIN;

	while( total < sz) {
		ssize_t ret;
		/* NOTE(review): the timeout applies to each poll() call, so
		 * across retries/partial reads the total wait can exceed it */
		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				/* quit/shutdown signal hints abort the read */
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* operation timed out */
			return -2;
		}
		/* poll reported readability; read as much as still missing */
		ret = read(s, buf+total, sz-total);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* closed connection! */
			return 0;
		}
		total += ret;
	}
	return total;
}
   2368 
/** Walk the task list in nsd->task[nsd->mytask]: execute every task
 * that is not of type task_apply_xfr immediately (results are appended
 * after last_task), and chain the task_apply_xfr tasks into the
 * xfrs2process list for later processing. */
static void
reload_process_non_xfr_tasks(struct nsd* nsd, udb_ptr* xfrs2process,
		udb_ptr* last_task)
{
	udb_ptr t, next, xfr_tail;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_init(&xfr_tail, u);
	/* take over the whole task list and detach it from the udb */
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	/* Execute all tasks except of type "task_apply_xfr". */
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		if(TASKLIST(&t)->task_type != task_apply_xfr) {
			/* process task t */
			/* append results for task t and update last_task */
			task_process_in_reload(nsd, u, last_task, &t);

		} else if(udb_ptr_is_null(xfrs2process)) {
			/* first xfr task: becomes both head and tail */
			udb_ptr_set_ptr( xfrs2process, u, &t);
			udb_ptr_set_ptr(&xfr_tail, u, &t);
		} else {
			/* append the xfr task at the tail of the list */
			udb_rptr_set_ptr(&TASKLIST(&xfr_tail)->next, u, &t);
			udb_ptr_set_ptr(&xfr_tail, u, &t);
		}
		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);
	}
	/* t and next are already unlinked (because they are null) */
	udb_ptr_unlink(&xfr_tail, u);
}
   2403 
/** Apply the queued task_apply_xfr tasks from xfrs2process, checking
 * after every task whether the old-main process asked us to quit.
 * Returns the number of xfrs applied; may exit(0) when told to quit,
 * after unlinking the xfr files of the remaining tasks. */
static size_t
reload_process_xfr_tasks(struct nsd* nsd, int cmdsocket, udb_ptr* xfrs2process)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr next;
	udb_base* u = nsd->task[nsd->mytask];
	size_t xfrs_processed = 0;

	udb_ptr_init(&next, u);
	while(!udb_ptr_is_null(xfrs2process)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(xfrs2process)->next);
		udb_rptr_zero(&TASKLIST(xfrs2process)->next, u);

		/* process xfr task at xfrs2process */
		assert(TASKLIST(xfrs2process)->task_type == task_apply_xfr);
		task_process_apply_xfr(nsd, u, xfrs2process);
		xfrs_processed += 1;

		/* go to next */
		udb_ptr_set_ptr(xfrs2process, u, &next);

		/* if the "old-main" has quit, we must quit too, poll the fd for cmds */
		/* timeout 0: non-blocking peek at the command socket */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) != sizeof(cmd))
			; /* pass */
		else if (cmd != NSD_QUIT)
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from old-main %d", (int)cmd));
		else {
			udb_ptr_unlink(&next, u);
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			/* unlink files of remainder of tasks */
			while(!udb_ptr_is_null(xfrs2process)) {
				assert(TASKLIST(xfrs2process)->task_type == task_apply_xfr);
				xfrd_unlink_xfrfile(nsd, TASKLIST(xfrs2process)->yesno);
				udb_ptr_set_rptr(xfrs2process, u, &TASKLIST(xfrs2process)->next);
			}
			exit(0);
		}
	}
	/* xfrs2process and next are already unlinked (because they are null) */
	return xfrs_processed;
}
   2446 
   2447 static void server_verify(struct nsd *nsd, int cmdsocket,
   2448 	struct sigaction* old_sigchld);
   2449 
/** state carried through the quit-sync exchange with old-main during
 * reload (used by server_reload_handle_quit_sync_ack) */
struct quit_sync_event_data {
	struct event_base* base; /* short-lived event loop to exit when done */
	size_t read;             /* number of command bytes received so far */
	union {
		uint8_t buf[sizeof(sig_atomic_t)]; /* byte-wise receive buffer */
		sig_atomic_t cmd;                  /* the assembled command */
	} to_read;
};
   2458 
   2459 static void server_reload_handle_sigchld(int sig, short event,
   2460 		void* ATTR_UNUSED(arg))
   2461 {
   2462 	assert(sig == SIGCHLD);
   2463 	assert((event & EV_SIGNAL));
   2464 
   2465 	/* reap the exited old-serve child(s) */
   2466 	while(waitpid(-1, NULL, WNOHANG) > 0) {
   2467 		/* pass */
   2468 	}
   2469 }
   2470 
/** Event callback for the reload quit-sync with old-main.
 * On timeout: (re)send NSD_QUIT_SYNC to old-main and keep waiting (the
 * event persists).  On read: accumulate the possibly-partial command
 * reply; when it is complete, or when old-main closed the socket or an
 * unrecoverable read error occurred (both treated as NSD_RELOAD), exit
 * the short-lived event loop. */
static void server_reload_handle_quit_sync_ack(int cmdsocket, short event,
		void* arg)
{
	struct quit_sync_event_data* cb_data =
		(struct quit_sync_event_data*)arg;
	ssize_t r;

	if((event & EV_TIMEOUT)) {
		sig_atomic_t cmd = NSD_QUIT_SYNC;

		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending command from "
				"reload to old-main: %s", strerror(errno));
		}
		/* Wait for cmdsocket to become readable or for next timeout,
		 * (this works because event is added EV_TIMEOUT|EV_PERSIST).
		 */
		return;
	}
	assert((event & EV_READ));
	assert(cb_data->read < sizeof(cb_data->to_read.cmd));

	r = read(cmdsocket, cb_data->to_read.buf + cb_data->read,
			sizeof(cb_data->to_read.cmd) - cb_data->read);
	if(r == 0) {
		/* EOF: old-main is gone; continue as if reload was acked */
		DEBUG(DEBUG_IPC, 1, (LOG_WARNING,
			"reload: old-main quit during quit sync"));
		cb_data->to_read.cmd = NSD_RELOAD;

	} else if(r == -1) {
		/* transient errors: stay in the event loop and retry */
		if(errno == EAGAIN || errno == EINTR)
			return;

		log_msg(LOG_ERR, "reload: could not wait for parent to quit: "
			"%s", strerror(errno));
		cb_data->to_read.cmd = NSD_RELOAD;

	} else if (cb_data->read + r  < sizeof(cb_data->to_read.cmd)) {
		/* More to read */
		cb_data->read += r;
		return;

	} else {
		assert(cb_data->read + r == sizeof(cb_data->to_read.cmd));
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d",
					(int)cb_data->to_read.cmd));
	}
	/* Done */
	event_base_loopexit(cb_data->base, NULL);
}
   2522 
   2523 /*
   2524  * Reload the database, stop parent, re-fork children and continue.
   2525  * as server_main.
   2526  */
   2527 static void
   2528 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
   2529 	int cmdsocket, udb_ptr* xfrs2process, udb_ptr* last_task)
   2530 {
   2531 	pid_t mypid;
   2532 	sig_atomic_t cmd;
   2533 	struct sigaction old_sigchld, ign_sigchld;
   2534 	struct radnode* node;
   2535 	zone_type* zone;
   2536 	enum soainfo_hint hint;
   2537 	struct quit_sync_event_data cb_data;
   2538 	struct event signal_event, cmd_event;
   2539 	struct timeval reload_sync_timeout;
   2540 	size_t xfrs_processed = 0;
   2541 	/* For swapping filedescriptors from the serve childs to the xfrd
   2542 	 * and/or the dnstap collector */
   2543 	int *swap_fd_send;
   2544 
   2545 	/* ignore SIGCHLD from the previous server_main that used this pid */
   2546 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
   2547 	ign_sigchld.sa_handler = SIG_IGN;
   2548 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
   2549 
   2550 #ifdef HAVE_CPUSET_T
   2551 	if(nsd->use_cpu_affinity) {
   2552 		set_cpu_affinity(nsd->cpuset);
   2553 	}
   2554 #endif
   2555 
   2556 	/* see what tasks we got from xfrd */
   2557 	xfrs_processed = reload_process_xfr_tasks(nsd, cmdsocket, xfrs2process);
   2558 
   2559 #ifndef NDEBUG
   2560 	if(nsd_debug_level >= 1)
   2561 		region_log_stats(nsd->db->region);
   2562 #endif /* NDEBUG */
   2563 	initialize_dname_compression_tables(nsd);
   2564 
   2565 #ifdef BIND8_STATS
   2566 	/* Restart dumping stats if required.  */
   2567 	time(&nsd->st->boot);
   2568 	set_bind8_alarm(nsd);
   2569 	/* Switch to a different set of stat array for new server processes,
   2570 	 * because they can briefly coexist with the old processes. They
   2571 	 * have their own stat structure. */
   2572 	nsd->stat_current = (nsd->stat_current==0?1:0);
   2573 #endif
   2574 #ifdef USE_ZONE_STATS
   2575 	server_zonestat_realloc(nsd); /* realloc for new children */
   2576 	server_zonestat_switch(nsd);
   2577 #endif
   2578 
   2579 	if(nsd->options->verify_enable) {
   2580 #ifdef RATELIMIT
   2581 		/* allocate resources for rate limiting. use a slot that is guaranteed
   2582 		   not mapped to a file so no persistent data is overwritten */
   2583 		rrl_init(nsd->child_count + 1);
   2584 #endif
   2585 
   2586 		/* spin-up server and execute verifiers for each zone */
   2587 		server_verify(nsd, cmdsocket, &old_sigchld);
   2588 #ifdef RATELIMIT
   2589 		/* deallocate rate limiting resources */
   2590 		rrl_deinit(nsd->child_count + 1);
   2591 #endif
   2592 	}
   2593 
   2594 	if(xfrs_processed) for( node = radix_first(nsd->db->zonetree)
   2595 	                      ; node != NULL; node = radix_next(node)) {
   2596 
   2597 		zone = (zone_type *)node->elem;
   2598 		if(zone->is_updated) {
   2599 			if(zone->is_bad) {
   2600 				nsd->mode = NSD_RELOAD_FAILED;
   2601 				hint = soainfo_bad;
   2602 			} else {
   2603 				hint = soainfo_ok;
   2604 			}
   2605 			/* update(s), verified or not, possibly with subsequent
   2606 			   skipped update(s). skipped update(s) are picked up
   2607 			   by failed update check in xfrd */
   2608 			task_new_soainfo(nsd->task[nsd->mytask], last_task,
   2609 			                 zone, hint);
   2610 		} else if(zone->is_skipped) {
   2611 			/* corrupt or inconsistent update without preceding
   2612 			   update(s), communicate soainfo_gone */
   2613 			task_new_soainfo(nsd->task[nsd->mytask], last_task,
   2614 			                 zone, soainfo_gone);
   2615 		}
   2616 		zone->is_updated = 0;
   2617 		zone->is_skipped = 0;
   2618 	}
   2619 
   2620 	if(nsd->mode == NSD_RELOAD_FAILED) {
   2621 		exit(NSD_RELOAD_FAILED);
   2622 	}
   2623 #ifdef BIND8_STATS
   2624 	nsd->stats_per_child[nsd->stat_current][0].reloadcount =
   2625 		nsd->stats_per_child[(nsd->stat_current==0?1:0)][0].reloadcount+1;
   2626 	nsd->stats_per_child[nsd->stat_current][0].db_mem =
   2627 		region_get_mem(nsd->db->region);
   2628 #endif
   2629 
   2630 	/* listen for the signals of failed children again */
   2631 	sigaction(SIGCHLD, &old_sigchld, NULL);
   2632 #ifdef USE_DNSTAP
   2633 	if (nsd->dt_collector) {
   2634 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
   2635 		/* Swap fd_send with fd_swap so old serve child and new serve
   2636 		 * childs will not write to the same pipe ends simultaneously */
   2637 		swap_fd_send = nsd->dt_collector_fd_send;
   2638 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
   2639 		nsd->dt_collector_fd_swap = swap_fd_send;
   2640 
   2641 	}
   2642 #endif
   2643 	swap_fd_send = nsd->serve2xfrd_fd_send;
   2644 	nsd->serve2xfrd_fd_send = nsd->serve2xfrd_fd_swap;
   2645 	nsd->serve2xfrd_fd_swap = swap_fd_send;
   2646 	/* Start new child processes */
   2647 	if (server_start_children(nsd, server_region, netio, &nsd->
   2648 		xfrd_listener->fd) != 0) {
   2649 		send_children_quit(nsd);
   2650 		exit(1);
   2651 	}
   2652 
   2653 	/* if the old-main has quit, we must quit too, poll the fd for cmds */
   2654 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
   2655 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
   2656 		if(cmd == NSD_QUIT) {
   2657 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
   2658 			send_children_quit(nsd);
   2659 			exit(0);
   2660 		}
   2661 	}
   2662 
   2663 	/* Send quit command to old-main: blocking, wait for receipt.
   2664 	 * The old-main process asks the old-serve processes to quit, however
   2665 	 * if a reload succeeded before, this process is the parent of the
   2666 	 * old-serve processes, so we need to reap the children for it.
   2667 	 */
   2668 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
   2669 	cmd = NSD_QUIT_SYNC;
   2670 	if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
   2671 	{
   2672 		log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
   2673 			strerror(errno));
   2674 	}
   2675 
   2676 	reload_sync_timeout.tv_sec = RELOAD_SYNC_TIMEOUT;
   2677 	reload_sync_timeout.tv_usec = 0;
   2678 
   2679 	cb_data.base = nsd_child_event_base();
   2680 	cb_data.to_read.cmd = cmd;
   2681 	cb_data.read = 0;
   2682 
   2683 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST,
   2684 	    server_reload_handle_sigchld, NULL);
   2685 	if(event_base_set(cb_data.base, &signal_event) != 0
   2686 	|| signal_add(&signal_event, NULL) != 0) {
   2687 		log_msg(LOG_ERR, "NSD quit sync: could not add signal event");
   2688 	}
   2689 
   2690 	event_set(&cmd_event, cmdsocket, EV_READ|EV_TIMEOUT|EV_PERSIST,
   2691 	    server_reload_handle_quit_sync_ack, &cb_data);
   2692 	if(event_base_set(cb_data.base, &cmd_event) != 0
   2693 	|| event_add(&cmd_event, &reload_sync_timeout) != 0) {
   2694 		log_msg(LOG_ERR, "NSD quit sync: could not add command event");
   2695 	}
   2696 
   2697 	/* short-lived main loop */
   2698 	event_base_dispatch(cb_data.base);
   2699 
   2700 	/* remove command and signal event handlers */
   2701 	event_del(&cmd_event);
   2702 	signal_del(&signal_event);
   2703 	/* restore the ordinary signal handler for SIGCHLD */
   2704 	sigaction(SIGCHLD, &old_sigchld, NULL);
   2705 	event_base_free(cb_data.base);
   2706 	cmd = cb_data.to_read.cmd;
   2707 
   2708 	if(cmd == NSD_QUIT) {
   2709 		/* small race condition possible here, parent got quit cmd. */
   2710 		send_children_quit(nsd);
   2711 		exit(1);
   2712 	}
   2713 	assert(cmd == NSD_RELOAD);
   2714 	udb_ptr_set(last_task, nsd->task[nsd->mytask], 0);
   2715 	task_process_sync(nsd->task[nsd->mytask]);
   2716 #ifdef USE_ZONE_STATS
   2717 	server_zonestat_realloc(nsd); /* realloc for next children */
   2718 #endif
   2719 
   2720 	/* send soainfo to the xfrd process, signal it that reload is done,
   2721 	 * it picks up the taskudb */
   2722 	cmd = NSD_RELOAD_DONE;
   2723 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
   2724 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
   2725 			strerror(errno));
   2726 	}
   2727 	mypid = getpid();
   2728 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
   2729 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
   2730 			strerror(errno));
   2731 	}
   2732 
   2733 	/* try to reopen file */
   2734 	if (nsd->file_rotation_ok)
   2735 		log_reopen(nsd->log_filename, 1);
   2736 	/* exit reload, continue as new server_main */
   2737 }
   2738 
   2739 /*
   2740  * Get the mode depending on the signal hints that have been received.
   2741  * Multiple signal hints can be received and will be handled in turn.
   2742  */
   2743 static sig_atomic_t
   2744 server_signal_mode(struct nsd *nsd)
   2745 {
   2746 	if(nsd->signal_hint_quit) {
   2747 		nsd->signal_hint_quit = 0;
   2748 		return NSD_QUIT;
   2749 	}
   2750 	else if(nsd->signal_hint_shutdown) {
   2751 		nsd->signal_hint_shutdown = 0;
   2752 		return NSD_SHUTDOWN;
   2753 	}
   2754 	else if(nsd->signal_hint_child) {
   2755 		nsd->signal_hint_child = 0;
   2756 		return NSD_REAP_CHILDREN;
   2757 	}
   2758 	else if(nsd->signal_hint_reload) {
   2759 		nsd->signal_hint_reload = 0;
   2760 		return NSD_RELOAD;
   2761 	}
   2762 	else if(nsd->signal_hint_reload_hup) {
   2763 		nsd->signal_hint_reload_hup = 0;
   2764 		return NSD_RELOAD_REQ;
   2765 	}
   2766 	else if(nsd->signal_hint_stats) {
   2767 		nsd->signal_hint_stats = 0;
   2768 #ifdef BIND8_STATS
   2769 		set_bind8_alarm(nsd);
   2770 #endif
   2771 		return NSD_STATS;
   2772 	}
   2773 	else if(nsd->signal_hint_statsusr) {
   2774 		nsd->signal_hint_statsusr = 0;
   2775 		return NSD_STATS;
   2776 	}
   2777 	return NSD_RUN;
   2778 }
   2779 
   2780 /*
   2781  * The main server simply waits for signals and child processes to
   2782  * terminate.  Child processes are restarted as necessary.
   2783  */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	/* pointer to the xfr tasks that will be processed in a second pass */
	udb_ptr xfrs2process;
	/* pointer to results of task processing */
	udb_ptr last_task;
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

#ifdef BIND8_STATS
	nsd->st = &nsd->stat_map[0];
	nsd->st->db_disk = 0;
	nsd->st->db_mem = region_get_mem(nsd->db->region);
#endif
	memset(&xfrs2process, 0, sizeof(xfrs2process));
	memset(&last_task, 0, sizeof(last_task));

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* This_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode? */
		if(mode == NSD_RUN) {
			nsd->mode = mode = server_signal_mode(nsd);
		}

		switch (mode) {
		case NSD_RUN:
			/* see if any child processes terminated; reap all of
			 * them without blocking and classify each pid */
			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
				int is_child = delete_child_pid(nsd, child_pid);
				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
					/* expected exit: this child was asked to quit */
					if(nsd->children[is_child].child_fd == -1)
						nsd->children[is_child].has_exited = 1;
					parent_check_all_children_exited(nsd);
				} else if(is_child != -1) {
					/* unexpected death of a serve child: respawn it */
					log_msg(LOG_WARNING,
					       "server %d died unexpectedly with status %d, restarting",
					       (int) child_pid, status);
					restart_child_servers(nsd, server_region, netio,
						&nsd->xfrd_listener->fd);
				} else if (child_pid == reload_pid) {
					/* the reload process died: keep serving from the
					 * old database, tear down the reload ipc channel
					 * and tell xfrd the reload attempt is over */
					sig_atomic_t cmd = NSD_RELOAD_FAILED;
					pid_t mypid;
					log_msg(LOG_WARNING,
					       "Reload process %d failed with status %d, continuing with old database",
					       (int) child_pid, status);
#ifdef HAVE_SETPROCTITLE
					setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
					log_set_process_role("main");
#endif
					reload_pid = -1;
					if(reload_listener.fd != -1) close(reload_listener.fd);
					netio_remove_handler(netio, &reload_listener);
					reload_listener.fd = -1;
					reload_listener.event_types = NETIO_EVENT_NONE;
					task_process_sync(nsd->task[nsd->mytask]);
					/* inform xfrd reload attempt ended */
					if(!write_socket(nsd->xfrd_listener->fd,
						&cmd, sizeof(cmd))) {
						log_msg(LOG_ERR, "problems "
						  "sending SOAEND to xfrd: %s",
						  strerror(errno));
					}
					mypid = getpid();
					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
							strerror(errno));
					}
#ifdef USE_DNSTAP
				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
					log_msg(LOG_WARNING,
					       "dnstap-collector %d terminated with status %d",
					       (int) child_pid, status);
					if(nsd->dt_collector) {
						dt_collector_close(nsd->dt_collector, nsd);
						dt_collector_destroy(nsd->dt_collector, nsd);
						nsd->dt_collector = NULL;
					}
					/* Only respawn a crashed (or exited)
					 * dnstap-collector when not reloading,
					 * to not induce a reload during a
					 * reload (which would seriously
					 * disrupt nsd procedures and lead to
					 * unpredictable results)!
					 *
					 * This will *leave* a dnstap-collector
					 * process terminated, but because
					 * signalling of the reload process to
					 * the main process to respawn in this
					 * situation will be cumbersome, and
					 * because this situation is so
					 * specific (and therefore hopefully
					 * extremely rare or non-existing at
					 * all), plus the fact that we are left
					 * with a perfectly functioning NSD
					 * (besides not logging dnstap
					 * messages), I consider it acceptable
					 * to leave this unresolved.
					 */
					if(reload_pid == -1 && nsd->options->dnstap_enable) {
						nsd->dt_collector = dt_collector_create(nsd);
						dt_collector_start(nsd->dt_collector, nsd);
						nsd->mode = NSD_RELOAD_REQ;
					}
#endif
				} else if(status != 0) {
					/* check for status, because we get
					 * the old-servermain because reload
					 * is the process-parent of old-main,
					 * and we get older server-processes
					 * that are exiting after a reload */
					log_msg(LOG_WARNING,
					       "process %d terminated with status %d",
					       (int) child_pid, status);
				}
			}
			if (child_pid == -1) {
				if (errno == EINTR) {
					continue;
				}
				if (errno != ECHILD)
					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
			}
			if (nsd->mode != NSD_RUN)
				break;

			/* timeout to collect processes. In case no sigchild happens. */
			timeout_spec.tv_sec = 1;
			timeout_spec.tv_nsec = 0;

			/* listen on ports, timeout for collecting terminated children */
			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
				}
			}
			if(nsd->restart_children) {
				restart_child_servers(nsd, server_region, netio,
					&nsd->xfrd_listener->fd);
				nsd->restart_children = 0;
			}
			if(nsd->reload_failed) {
				/* the reload process reported failure over ipc;
				 * same recovery as the waitpid branch above */
				sig_atomic_t cmd = NSD_RELOAD_FAILED;
				pid_t mypid;
				nsd->reload_failed = 0;
				log_msg(LOG_WARNING,
				       "Reload process %d failed, continuing with old database",
				       (int) reload_pid);
#ifdef HAVE_SETPROCTITLE
				setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
				log_set_process_role("main");
#endif
				reload_pid = -1;
				if(reload_listener.fd != -1) close(reload_listener.fd);
				netio_remove_handler(netio, &reload_listener);
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				task_process_sync(nsd->task[nsd->mytask]);
				/* inform xfrd reload attempt ended */
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "problems "
					  "sending SOAEND to xfrd: %s",
					  strerror(errno));
				}
				mypid = getpid();
				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
						strerror(errno));
				}
			}

			break;
		case NSD_RELOAD_REQ: {
			/* SIGHUP: ask xfrd to initiate the reload sequence */
			sig_atomic_t cmd = NSD_RELOAD_REQ;
			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
			DEBUG(DEBUG_IPC,1, (LOG_INFO,
				"main: ipc send reload_req to xfrd"));
			if(!write_socket(nsd->xfrd_listener->fd,
				&cmd, sizeof(cmd))) {
				log_msg(LOG_ERR, "server_main: could not send "
				"reload_req to xfrd: %s", strerror(errno));
			}
			nsd->mode = NSD_RUN;
			} break;
		case NSD_RELOAD:
			/* Continue to run nsd after reload */
			nsd->mode = NSD_RUN;
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
			if (reload_pid != -1) {
				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
				       (int) reload_pid);
				break;
			}

			/* switch the mytask to keep track of who owns task*/
			nsd->mytask = 1 - nsd->mytask;
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
				reload_pid = -1;
				break;
			}
			/* Execute the tasks that cannot fail */
#ifdef HAVE_SETPROCTITLE
			setproctitle("load");
#endif
#ifdef USE_LOG_PROCESS_ROLE
			log_set_process_role("load");
#endif
			/* Already process the non xfr tasks, so that a failed
			 * transfer (which can exit) will not nullify the
			 * effects of the other tasks that will not exit.
			 */
			task_remap(nsd->task[nsd->mytask]);
			udb_ptr_init(&xfrs2process, nsd->task[nsd->mytask]);
			udb_ptr_init(&last_task   , nsd->task[nsd->mytask]);
			/* last_task and xfrs2process MUST be unlinked in all
			 * possible branches of the fork() below.
			 * server_reload() will unlink them, but for failed
			 * fork and for the "old-main" (child) process, we MUST
			 * unlink them in the case statement below.
			 * Unlink by setting the value to 0, because
			 * reload_process_non_xfr_tasks() may clear (and
			 * implicitly unlink) xfrs2process.
			 */
			reload_process_non_xfr_tasks(nsd, &xfrs2process
			                                , &last_task);
			/* Do actual reload */
			reload_pid = fork();
			switch (reload_pid) {
			case -1:
				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
				udb_ptr_set(&last_task, nsd->task[nsd->mytask], 0);
				udb_ptr_set(&xfrs2process, nsd->task[nsd->mytask], 0);
				break;
			default:
				/* PARENT */
				/* the parent runs server_reload() and, when it
				 * returns, continues this loop as the new main */
				close(reload_sockets[0]);
				server_reload(nsd, server_region, netio
				                 , reload_sockets[1]
				                 , &xfrs2process
						 , &last_task);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
				close(reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
				/* drop stale xfrd ipc data */
				((struct ipc_handler_conn_data*)nsd->
					xfrd_listener->user_data)
					->conn->is_reading = 0;
				reload_pid = -1;
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
				break;
			case 0:
				/* CHILD */
				/* server_main keep running until NSD_QUIT_SYNC
				 * received from reload. */
				close(reload_sockets[1]);
#ifdef HAVE_SETPROCTITLE
				setproctitle("old-main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
				log_set_process_role("old-main");
#endif
				udb_ptr_set(&last_task, nsd->task[nsd->mytask], 0);
				udb_ptr_set(&xfrs2process, nsd->task[nsd->mytask], 0);
				reload_listener.fd = reload_sockets[0];
				reload_listener.timeout = NULL;
				reload_listener.user_data = nsd;
				reload_listener.event_types = NETIO_EVENT_READ;
				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
				netio_add_handler(netio, &reload_listener);
				reload_pid = getppid();
				break;
			}
			if(reload_pid == -1) {
				/* Reset proctitle after "load" process exited
				 * or when fork() failed
				 */
#ifdef HAVE_SETPROCTITLE
				setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
				log_set_process_role("main");
#endif
			}
			break;
		case NSD_QUIT_SYNC:
			/* synchronisation of xfrd, parent and reload */
			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
				sig_atomic_t cmd = NSD_RELOAD;
				/* stop xfrd ipc writes in progress */
				DEBUG(DEBUG_IPC,1, (LOG_INFO,
					"main: ipc send indication reload"));
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: could not send reload "
					"indication to xfrd: %s", strerror(errno));
				}
				/* wait for ACK from xfrd */
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
				nsd->quit_sync_done = 1;
			}
			nsd->mode = NSD_RUN;
			break;
		case NSD_QUIT:
			/* silent shutdown during reload */
			if(reload_listener.fd != -1) {
				/* acknowledge the quit, to sync reload that we will really quit now */
				sig_atomic_t cmd = NSD_RELOAD;
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: "
						"could not ack quit: %s", strerror(errno));
				}
				close(reload_listener.fd);
			}
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
			/* only quit children after xfrd has acked */
			send_children_quit(nsd);

#ifdef MEMCLEAN /* OS collects memory pages */
			region_destroy(server_region);
#endif
			/* does not return */
			server_shutdown(nsd);

			/* ENOTREACH */
			break;
		case NSD_SHUTDOWN:
			break;
		case NSD_REAP_CHILDREN:
			/* continue; wait for child in run loop */
			nsd->mode = NSD_RUN;
			break;
		case NSD_STATS:
#ifdef BIND8_STATS
			set_children_stats(nsd);
#endif
			nsd->mode = NSD_RUN;
			break;
		default:
			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
			nsd->mode = NSD_RUN;
			break;
		}
	}
	log_msg(LOG_WARNING, "signal received, shutting down...");

	/* close opened ports to avoid race with restart of nsd */
	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	daemon_remote_close(nsd->rc);
	send_children_quit_and_wait(nsd);

	/* Unlink it if possible... */
	unlinkpid(nsd->pidfile, nsd->username);
	unlink(nsd->task[0]->fname);
	unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
	unlink(nsd->zonestatfname[0]);
	unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
	server_stat_free(nsd);
#endif
#ifdef USE_DNSTAP
	dt_collector_close(nsd->dt_collector, nsd);
#endif

	if(reload_listener.fd != -1) {
		/* tell the reload process to quit, then reap it */
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to reload-process"));
		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
				strerror(errno));
		}
		fsync(reload_listener.fd);
		close(reload_listener.fd);
		/* wait for reload to finish processing */
		while(1) {
			if(waitpid(reload_pid, NULL, 0) == -1) {
				if(errno == EINTR) continue;
				if(errno == ECHILD) break;
				log_msg(LOG_ERR, "waitpid(reload %d): %s",
					(int)reload_pid, strerror(errno));
			}
			break;
		}
	}
	if(nsd->xfrd_listener->fd != -1) {
		/* complete quit, stop xfrd */
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to xfrd"));
		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
				strerror(errno));
		}
		fsync(nsd->xfrd_listener->fd);
		close(nsd->xfrd_listener->fd);
		(void)kill(nsd->pid, SIGTERM);
	}

#ifdef USE_XDP
	xdp_server_cleanup(&nsd->xdp.xdp_server);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
	region_destroy(server_region);
#endif
	server_shutdown(nsd);
}
   3226 
   3227 static query_state_type
   3228 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
   3229 {
   3230 	return query_process(query, nsd, now_p);
   3231 }
   3232 
   3233 static query_state_type
   3234 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
   3235 {
   3236 #ifdef RATELIMIT
   3237 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
   3238 		if(query->edns.cookie_status != COOKIE_VALID
   3239 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
   3240 		&& rrl_process_query(query))
   3241 			return rrl_slip(query);
   3242 		else	return QUERY_PROCESSED;
   3243 	}
   3244 	return QUERY_DISCARDED;
   3245 #else
   3246 	return query_process(query, nsd, now_p);
   3247 #endif
   3248 }
   3249 
/*
 * Return the event library's version string, or the empty string when
 * compiled with the builtin mini_event.
 */
const char*
nsd_event_vs(void)
{
#ifdef USE_MINI_EVENT
	return "";
#else
	return event_get_version();
#endif
}
   3259 
#if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
/* Translate a libev backend flag into a human-readable name. */
static const char* ub_ev_backend2str(int b)
{
	static const struct {
		int flag;
		const char* name;
	} backends[] = {
		{ EVBACKEND_SELECT,	"select" },
		{ EVBACKEND_POLL,	"poll" },
		{ EVBACKEND_EPOLL,	"epoll" },
		{ EVBACKEND_KQUEUE,	"kqueue" },
		{ EVBACKEND_DEVPOLL,	"devpoll" },
		{ EVBACKEND_PORT,	"evport" }
	};
	size_t i;
	for(i = 0; i < sizeof(backends)/sizeof(backends[0]); i++) {
		if(backends[i].flag == b)
			return backends[i].name;
	}
	return "unknown";
}
#endif
   3274 
/*
 * Return the name of the polling method the event library uses
 * (e.g. "select", "epoll", "kqueue").
 */
const char*
nsd_event_method(void)
{
#ifdef USE_MINI_EVENT
	return "select";
#else
	struct event_base* b = nsd_child_event_base();
	const char* m;
#  ifdef EV_FEATURE_BACKENDS
	/* libev: query the loop for the backend it selected */
	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
#  elif defined(HAVE_EVENT_BASE_GET_METHOD)
	/* libevent */
	m = event_base_get_method(b);
#  else
	m = "?";
#  endif
#  ifdef MEMCLEAN
	event_base_free(b);
#  endif
	return m;
#endif
}
   3296 
/*
 * Create an event base, selecting the right constructor for the
 * event library this build uses (mini_event, libev or libevent).
 */
struct event_base*
nsd_child_event_base(void)
{
	struct event_base* base;
#ifdef USE_MINI_EVENT
	/* mini_event keeps its own time bookkeeping in these statics */
	static time_t secs;
	static struct timeval now;
	base = event_init(&secs, &now);
#else
#  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
	/* libev */
	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
#  else
	/* libevent */
#    ifdef HAVE_EVENT_BASE_NEW
	base = event_base_new();
#    else
	base = event_init();
#    endif
#  endif
#endif
	return base;
}
   3320 
   3321 static void
   3322 add_udp_handler(
   3323 	struct nsd *nsd,
   3324 	struct nsd_socket *sock,
   3325 	struct udp_handler_data *data)
   3326 {
   3327 	struct event *handler = &data->event;
   3328 
   3329 	data->nsd = nsd;
   3330 	data->socket = sock;
   3331 
   3332 	if(nsd->options->proxy_protocol_port &&
   3333 		sockaddr_uses_proxy_protocol_port(nsd->options,
   3334 		(struct sockaddr *)&sock->addr.ai_addr)) {
   3335 		data->pp2_enabled = 1;
   3336 	}
   3337 
   3338 	memset(handler, 0, sizeof(*handler));
   3339 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
   3340 	if(event_base_set(nsd->event_base, handler) != 0)
   3341 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
   3342 	if(event_add(handler, NULL) != 0)
   3343 		log_msg(LOG_ERR, "nsd udp: event_add failed");
   3344 }
   3345 
   3346 void
   3347 add_tcp_handler(
   3348 	struct nsd *nsd,
   3349 	struct nsd_socket *sock,
   3350 	struct tcp_accept_handler_data *data)
   3351 {
   3352 	struct event *handler = &data->event;
   3353 
   3354 	data->nsd = nsd;
   3355 	data->socket = sock;
   3356 
   3357 	if(nsd->options->proxy_protocol_port &&
   3358 		sockaddr_uses_proxy_protocol_port(nsd->options,
   3359 		(struct sockaddr *)&sock->addr.ai_addr)) {
   3360 		data->pp2_enabled = 1;
   3361 	}
   3362 
   3363 #ifdef HAVE_SSL
   3364 	if (nsd->tls_ctx &&
   3365 	    nsd->options->tls_port &&
   3366 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
   3367 	{
   3368 		data->tls_accept = 1;
   3369 		if(verbosity >= 2) {
   3370 			char buf[48];
   3371 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
   3372 			VERBOSITY(5, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
   3373 		}
   3374 	} else {
   3375 		data->tls_accept = 0;
   3376 	}
   3377 	if (nsd->tls_auth_ctx &&
   3378 	    nsd->options->tls_auth_port &&
   3379 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_auth_port))
   3380 	{
   3381 		data->tls_auth_accept = 1;
   3382 		if(verbosity >= 2) {
   3383 			char buf[48];
   3384 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
   3385 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS-AUTH service on interface %s", buf));
   3386 		}
   3387 
   3388 	} else {
   3389 		data->tls_auth_accept = 0;
   3390 	}
   3391 #endif
   3392 
   3393 	memset(handler, 0, sizeof(*handler));
   3394 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
   3395 	if(event_base_set(nsd->event_base, handler) != 0)
   3396 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
   3397 	if(event_add(handler, NULL) != 0)
   3398 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
   3399 	data->event_added = 1;
   3400 }
   3401 
#ifdef USE_XDP
/*
 * Register a persistent read event for the AF_XDP socket of this queue
 * so that handle_xdp is invoked when packets arrive.
 */
static void
add_xdp_handler(struct nsd *nsd,
	            struct xdp_server *xdp,
	            struct xdp_handler_data *data) {

	int sock;
	struct event *handler = &data->event;

	data->nsd = nsd;
	data->server = xdp;

	memset(handler, 0, sizeof(*handler));
	sock = xsk_socket__fd(xdp->xsks[xdp->queue_index].xsk);
	if (sock < 0) {
		/* NOTE(review): libbpf functions usually return a negative
		 * error code rather than setting errno, so strerror(errno)
		 * may print an unrelated message here; presumably
		 * strerror(-sock) was intended -- confirm against libbpf */
		log_msg(LOG_ERR, "xdp: xsk socket file descriptor is invalid: %s",
		        strerror(errno));
		return;
	}
	// TODO: check which EV_flags are needed
	event_set(handler, sock, EV_PERSIST|EV_READ, handle_xdp, data);
	if (event_base_set(nsd->event_base, handler) != 0)
		log_msg(LOG_ERR, "nsd xdp: event_base_set failed");
	if (event_add(handler, NULL) != 0)
		log_msg(LOG_ERR, "nsd xdp: event_add failed");
}
#endif
   3429 
   3430 /*
   3431  * Serve DNS request to verifiers (short-lived)
   3432  */
static void server_verify(struct nsd *nsd, int cmdsocket,
	struct sigaction* old_sigchld)
{
	size_t size = 0;
	struct event cmd_event, signal_event, exit_event;
	struct zone *zone;

	assert(nsd != NULL);

	/* nothing to do when no zone is marked for verification */
	zone = verify_next_zone(nsd, NULL);
	if(zone == NULL)
		return;

	nsd->server_region = region_create(xalloc, free);
	nsd->event_base = nsd_child_event_base();

	nsd->next_zone_to_verify = zone;
	nsd->verifier_count = 0;
	nsd->verifier_limit = nsd->options->verifier_count;
	size = sizeof(struct verifier) * nsd->verifier_limit;
	/* verifier child processes report their exit over this pipe */
	if(pipe(nsd->verifier_pipe) == -1) {
		log_msg(LOG_ERR, "verify: could not create pipe: %s",
				strerror(errno));
		goto fail_pipe;
	}
	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
	nsd->verifiers = region_alloc_zero(nsd->server_region, size);

	/* mark every verifier slot as unused */
	for(size_t i = 0; i < nsd->verifier_limit; i++) {
		nsd->verifiers[i].nsd = nsd;
		nsd->verifiers[i].zone = NULL;
		nsd->verifiers[i].pid = -1;
		nsd->verifiers[i].output_stream.fd = -1;
		nsd->verifiers[i].output_stream.priority = LOG_INFO;
		nsd->verifiers[i].error_stream.fd = -1;
		nsd->verifiers[i].error_stream.priority = LOG_ERR;
	}

	/* commands from the parent process (e.g. quit) */
	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
	   event_add(&cmd_event, NULL) != 0)
	{
		log_msg(LOG_ERR, "verify: could not add command event");
		goto fail;
	}

	/* SIGCHLD: a verifier child terminated */
	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
	   signal_add(&signal_event, NULL) != 0)
	{
		log_msg(LOG_ERR, "verify: could not add signal event");
		goto fail;
	}

	/* exit notifications written by verifiers on the pipe */
	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
	   event_add(&exit_event, NULL) != 0)
  {
		log_msg(LOG_ERR, "verify: could not add exit event");
		goto fail;
	}

	/* prepare the recvmmsg buffers used to serve queries to verifiers */
	memset(msgs, 0, sizeof(msgs));
	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
		queries[i] = query_create(nsd->server_region,
			compressed_dname_offsets,
			compression_table_size, compressed_dnames);
		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
		msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr;
		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
	}

	/* serve DNS on the dedicated verify interfaces (UDP and TCP) */
	for (size_t i = 0; i < nsd->verify_ifs; i++) {
		struct udp_handler_data *data;
		data = region_alloc_zero(
			nsd->server_region, sizeof(*data));
		add_udp_handler(nsd, &nsd->verify_udp[i], data);
	}

	tcp_accept_handler_count = nsd->verify_ifs;
	tcp_accept_handlers = region_alloc_array(nsd->server_region,
		nsd->verify_ifs, sizeof(*tcp_accept_handlers));

	for (size_t i = 0; i < nsd->verify_ifs; i++) {
		struct tcp_accept_handler_data *data;
		data = &tcp_accept_handlers[i];
		memset(data, 0, sizeof(*data));
		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
	}

	/* start up to verifier_limit verifiers; the remaining zones are
	 * picked up by the event handlers as verifiers finish */
	while(nsd->next_zone_to_verify != NULL &&
	      nsd->verifier_count < nsd->verifier_limit)
	{
		verify_zone(nsd, nsd->next_zone_to_verify);
		nsd->next_zone_to_verify
			= verify_next_zone(nsd, nsd->next_zone_to_verify);
	}

	/* short-lived main loop */
	event_base_dispatch(nsd->event_base);

	/* remove command and exit event handlers */
	event_del(&exit_event);
	event_del(&cmd_event);

	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
	signal_del(&signal_event);
fail:
	/* restore SIGCHLD handling and release all verify resources;
	 * fail_pipe skips the pipe close because it was never created */
	sigaction(SIGCHLD, old_sigchld, NULL);
	close(nsd->verifier_pipe[0]);
	close(nsd->verifier_pipe[1]);
fail_pipe:
	event_base_free(nsd->event_base);
	region_destroy(nsd->server_region);

	nsd->event_base = NULL;
	nsd->server_region = NULL;
	nsd->verifier_limit = 0;
	nsd->verifier_pipe[0] = -1;
	nsd->verifier_pipe[1] = -1;
	nsd->verifiers = NULL;
}
   3561 
   3562 /*
   3563  * Serve DNS requests.
   3564  */
   3565 void
   3566 server_child(struct nsd *nsd)
   3567 {
   3568 	size_t i, from, numifs;
   3569 	region_type *server_region = region_create(xalloc, free);
   3570 	struct event_base* event_base = nsd_child_event_base();
   3571 	sig_atomic_t mode;
   3572 #ifdef USE_LOG_PROCESS_ROLE
   3573 	static char child_name[20];
   3574 #endif
   3575 
   3576 	if(!event_base) {
   3577 		log_msg(LOG_ERR, "nsd server could not create event base");
   3578 		exit(1);
   3579 	}
   3580 	nsd->event_base = event_base;
   3581 	nsd->server_region = server_region;
   3582 
   3583 #ifdef RATELIMIT
   3584 	rrl_init(nsd->this_child->child_num);
   3585 #endif
   3586 
   3587 	assert(nsd->server_kind != NSD_SERVER_MAIN);
   3588 
   3589 #ifdef HAVE_SETPROCTITLE
   3590 	setproctitle("server %d", nsd->this_child->child_num + 1);
   3591 #endif
   3592 #ifdef USE_LOG_PROCESS_ROLE
   3593 	snprintf(child_name, sizeof(child_name), "srv%d",
   3594 		nsd->this_child->child_num + 1);
   3595 	log_set_process_role(child_name);
   3596 #endif
   3597 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
   3598 
   3599 #ifdef HAVE_CPUSET_T
   3600 	if(nsd->use_cpu_affinity) {
   3601 		set_cpu_affinity(nsd->this_child->cpuset);
   3602 	}
   3603 #endif
   3604 #ifdef BIND8_STATS
   3605 	nsd->st = &nsd->stats_per_child[nsd->stat_current]
   3606 		[nsd->this_child->child_num];
   3607 	nsd->st->boot = nsd->stat_map[0].boot;
   3608 	memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc));
   3609 #endif
   3610 
   3611 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
   3612 		server_close_all_sockets(nsd->tcp, nsd->ifs);
   3613 	}
   3614 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
   3615 		server_close_all_sockets(nsd->udp, nsd->ifs);
   3616 	}
   3617 
   3618 	if (nsd->this_child->parent_fd != -1) {
   3619 		struct event *handler;
   3620 		struct ipc_handler_conn_data* user_data =
   3621 			(struct ipc_handler_conn_data*)region_alloc(
   3622 			server_region, sizeof(struct ipc_handler_conn_data));
   3623 		user_data->nsd = nsd;
   3624 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
   3625 
   3626 		handler = (struct event*) region_alloc(
   3627 			server_region, sizeof(*handler));
   3628 		memset(handler, 0, sizeof(*handler));
   3629 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
   3630 			EV_READ, child_handle_parent_command, user_data);
   3631 		if(event_base_set(event_base, handler) != 0)
   3632 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
   3633 		if(event_add(handler, NULL) != 0)
   3634 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
   3635 	}
   3636 
   3637 	if(nsd->reuseport) {
   3638 		numifs = nsd->ifs / nsd->reuseport;
   3639 		from = numifs * nsd->this_child->child_num;
   3640 		if(from+numifs > nsd->ifs) { /* should not happen */
   3641 			from = 0;
   3642 			numifs = nsd->ifs;
   3643 		}
   3644 	} else {
   3645 		from = 0;
   3646 		numifs = nsd->ifs;
   3647 	}
   3648 
   3649 	if ((nsd->server_kind & NSD_SERVER_UDP)) {
   3650 		int child = nsd->this_child->child_num;
   3651 		memset(msgs, 0, sizeof(msgs));
   3652 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
   3653 			queries[i] = query_create(server_region,
   3654 				compressed_dname_offsets,
   3655 				compression_table_size, compressed_dnames);
   3656 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
   3657 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
   3658 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
   3659 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
   3660 			msgs[i].msg_hdr.msg_iovlen  = 1;
   3661 			msgs[i].msg_hdr.msg_name    = &queries[i]->remote_addr;
   3662 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
   3663 		}
   3664 
   3665 		for (i = 0; i < nsd->ifs; i++) {
   3666 			int listen;
   3667 			struct udp_handler_data *data;
   3668 
   3669 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
   3670 
   3671 			if(i >= from && i < (from + numifs) && listen) {
   3672 				data = region_alloc_zero(
   3673 					nsd->server_region, sizeof(*data));
   3674 				add_udp_handler(nsd, &nsd->udp[i], data);
   3675 			} else {
   3676 				/* close sockets intended for other servers */
   3677 				server_close_socket(&nsd->udp[i]);
   3678 			}
   3679 		}
   3680 	}
   3681 
   3682 	/*
   3683 	 * Keep track of all the TCP accept handlers so we can enable
   3684 	 * and disable them based on the current number of active TCP
   3685 	 * connections.
   3686 	 */
   3687 	if ((nsd->server_kind & NSD_SERVER_TCP)) {
   3688 		int child = nsd->this_child->child_num;
   3689 		tcp_accept_handler_count = numifs;
   3690 		tcp_accept_handlers = region_alloc_array(server_region,
   3691 			numifs, sizeof(*tcp_accept_handlers));
   3692 
   3693 		for (i = 0; i < nsd->ifs; i++) {
   3694 			int listen;
   3695 			struct tcp_accept_handler_data *data;
   3696 
   3697 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
   3698 
   3699 			if(i >= from && i < (from + numifs) && listen) {
   3700 				data = &tcp_accept_handlers[i-from];
   3701 				memset(data, 0, sizeof(*data));
   3702 				add_tcp_handler(nsd, &nsd->tcp[i], data);
   3703 			} else {
   3704 				/* close sockets intended for other servers */
   3705 				server_close_socket(&nsd->tcp[i]);
   3706 			}
   3707 		}
   3708 	} else {
   3709 		tcp_accept_handler_count = 0;
   3710 	}
   3711 
   3712 #ifdef USE_XDP
   3713 	if (nsd->options->xdp_interface) {
   3714 		/* don't try to bind more sockets than there are queues available */
   3715 		if ((int)nsd->xdp.xdp_server.queue_count <= nsd->this_child->child_num) {
   3716 			log_msg(LOG_WARNING,
   3717 			        "xdp: server-count exceeds available queues (%d) on "
   3718 			        "interface %s, skipping xdp in this process",
   3719 			        nsd->xdp.xdp_server.queue_count,
   3720 			        nsd->xdp.xdp_server.interface_name);
   3721 		} else {
   3722 			struct xdp_handler_data *data;
   3723 			const int scratch_data_len = 1;
   3724 			void *scratch_data = region_alloc_zero(nsd->server_region,
   3725 			                                       scratch_data_len);
   3726 
   3727 			nsd->xdp.xdp_server.queue_index = nsd->this_child->child_num;
   3728 			nsd->xdp.xdp_server.queries = xdp_queries;
   3729 
   3730 			log_msg(LOG_INFO,
   3731 			        "xdp: using socket with queue_id %d on interface %s",
   3732 			        nsd->xdp.xdp_server.queue_index,
   3733 			        nsd->xdp.xdp_server.interface_name);
   3734 
   3735 			data = region_alloc_zero(nsd->server_region, sizeof(*data));
   3736 			add_xdp_handler(nsd, &nsd->xdp.xdp_server, data);
   3737 
   3738 			for (i = 0; i < XDP_RX_BATCH_SIZE; i++) {
   3739 				/* Be aware that the buffer is initialized with scratch data
   3740 				 * and will be filled by the xdp handle and receive function
   3741 				 * that receives the packet data.
   3742 				 * Using scratch data so that the existing functions in regards
   3743 				 * to queries and buffers don't break by use of NULL pointers */
   3744 				struct buffer *buffer = region_alloc_zero(
   3745 				                            nsd->server_region,
   3746 				                            sizeof(struct buffer));
   3747 				buffer_create_from(buffer, scratch_data, scratch_data_len);
   3748 				xdp_queries[i] = query_create_with_buffer(
   3749 				                                  server_region,
   3750 				                                  compressed_dname_offsets,
   3751 				                                  compression_table_size,
   3752 				                                  compressed_dnames,
   3753 				                                  buffer);
   3754 				query_reset(xdp_queries[i], UDP_MAX_MESSAGE_LEN, 0);
   3755 			}
   3756 		}
   3757 	}
   3758 #endif
   3759 
   3760 	/* The main loop... */
   3761 	while ((mode = nsd->mode) != NSD_QUIT) {
   3762 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
   3763 
   3764 		/* Do we need to do the statistics... */
   3765 		if (mode == NSD_STATS) {
   3766 #ifdef BIND8_STATS
   3767 			int p = nsd->st_period;
   3768 			nsd->st_period = 1; /* force stats printout */
   3769 			/* Dump the statistics */
   3770 			bind8_stats(nsd);
   3771 			nsd->st_period = p;
   3772 #else /* !BIND8_STATS */
   3773 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
   3774 #endif /* BIND8_STATS */
   3775 
   3776 			nsd->mode = NSD_RUN;
   3777 		}
   3778 		else if (mode == NSD_REAP_CHILDREN) {
   3779 			/* got signal, notify parent. parent reaps terminated children. */
   3780 			if (nsd->this_child->parent_fd != -1) {
   3781 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
   3782 				if (write(nsd->this_child->parent_fd,
   3783 				    &parent_notify,
   3784 				    sizeof(parent_notify)) == -1)
   3785 				{
   3786 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
   3787 						(int) nsd->this_child->pid, strerror(errno));
   3788 				}
   3789 			} else /* no parent, so reap 'em */
   3790 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
   3791 			nsd->mode = NSD_RUN;
   3792 		}
   3793 		else if(mode == NSD_RUN) {
   3794 			/* Wait for a query... */
   3795 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
   3796 				if (errno != EINTR) {
   3797 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
   3798 					break;
   3799 				}
   3800 			}
   3801 		} else if(mode == NSD_QUIT) {
   3802 			/* ignore here, quit */
   3803 		} else {
   3804 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
   3805 				(int)mode);
   3806 			nsd->mode = NSD_RUN;
   3807 		}
   3808 	}
   3809 
   3810 	/* This part is seemingly never reached as the loop WOULD exit on NSD_QUIT,
   3811 	 * but nsd->mode is only set to NSD_QUIT in ipc_child_quit. However, that
   3812 	 * function also calls exit(). */
   3813 
   3814 	service_remaining_tcp(nsd);
   3815 #ifdef	BIND8_STATS
   3816 	bind8_stats(nsd);
   3817 #endif /* BIND8_STATS */
   3818 
   3819 #ifdef MEMCLEAN /* OS collects memory pages */
   3820 #ifdef RATELIMIT
   3821 	rrl_deinit(nsd->this_child->child_num);
   3822 #endif
   3823 	event_base_free(event_base);
   3824 	region_destroy(server_region);
   3825 #endif
   3826 	server_shutdown(nsd);
   3827 }
   3828 
   3829 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
   3830 {
   3831 	int* timed_out = (int*)arg;
   3832         assert((event & EV_TIMEOUT)); (void)event;
   3833 	/* wake up the service tcp thread, note event is no longer
   3834 	 * registered */
   3835 	*timed_out = 1;
   3836 }
   3837 
/*
 * Finish serving the TCP connections that are still open: re-register
 * them on a fresh event base with capped timeouts and no new queries
 * allowed, then service them until done, told to quit, or a 3 second
 * idle timer fires.
 */
void
service_remaining_tcp(struct nsd* nsd)
{
	struct tcp_handler_data* p;
	struct event_base* event_base;
	/* check if it is needed */
	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
		return;
	VERBOSITY(5, (LOG_INFO, "service remaining TCP connections"));
#ifdef USE_DNSTAP
	/* remove dnstap collector, we cannot write there because the new
	 * child process is using the file descriptor, or the child
	 * process after that. */
	dt_collector_destroy(nsd->dt_collector, nsd);
	nsd->dt_collector = NULL;
#endif
	/* setup event base */
	event_base = nsd_child_event_base();
	if(!event_base) {
		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
		return;
	}
	/* register tcp connections */
	for(p = tcp_active_list; p != NULL; p = p->next) {
		struct timeval timeout;
		int fd = p->event.ev_fd;
#ifdef USE_MINI_EVENT
		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
#else
		short event = p->event.ev_events & (EV_READ|EV_WRITE);
#endif
		void (*fn)(int, short, void*);
#ifdef HAVE_SSL
		/* pick the callback matching the connection type (TLS or
		 * plain TCP) and the direction it was waiting for */
		if(p->tls) {
			if((event&EV_READ))
				fn = handle_tls_reading;
			else	fn = handle_tls_writing;
		} else if(p->tls_auth) {
			if((event&EV_READ))
				fn = handle_tls_reading;
			else	fn = handle_tls_writing;
		} else {
#endif
			if((event&EV_READ))
				fn = handle_tcp_reading;
			else	fn = handle_tcp_writing;
#ifdef HAVE_SSL
		}
#endif

		/* existing queries may finish, but accept no new ones */
		p->tcp_no_more_queries = 1;
		/* set timeout to 3 seconds (previously 1/10 second) */
		if(p->tcp_timeout > 3000)
			p->tcp_timeout = 3000;
		timeout.tv_sec = p->tcp_timeout / 1000;
		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
		event_del(&p->event);
		memset(&p->event, 0, sizeof(p->event));
		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
			fn, p);
		if(event_base_set(event_base, &p->event) != 0)
			log_msg(LOG_ERR, "event base set failed");
		if(event_add(&p->event, &timeout) != 0)
			log_msg(LOG_ERR, "event add failed");
	}

	/* handle it */
	while(nsd->current_tcp_count > 0) {
		mode_t m = server_signal_mode(nsd);
		struct event timeout;
		struct timeval tv;
		int timed_out = 0;
		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
			m == NSD_REAP_CHILDREN) {
			/* quit */
			break;
		}
		/* timer */
		/* have to do something every 3 seconds */
		tv.tv_sec = 3;
		tv.tv_usec = 0;
		memset(&timeout, 0, sizeof(timeout));
		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
			&timed_out);
		if(event_base_set(event_base, &timeout) != 0)
			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
		if(event_add(&timeout, &tv) != 0)
			log_msg(LOG_ERR, "remaintcp timer: event_add failed");

		/* service loop */
		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
			if (errno != EINTR) {
				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
				break;
			}
		}
		if(!timed_out) {
			event_del(&timeout);
		} else {
			/* timed out, quit */
			VERBOSITY(5, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
			break;
		}
	}
#ifdef MEMCLEAN
	event_base_free(event_base);
#endif
	/* continue to quit after return */
}
   3947 
   3948 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
   3949  * are always used, even if nonblocking operations are broken, in which case
   3950  * NUM_RECV_PER_SELECT is defined to 1 (one).
   3951  */
   3952 #if defined(HAVE_RECVMMSG)
   3953 #define nsd_recvmmsg recvmmsg
   3954 #else /* !HAVE_RECVMMSG */
   3955 
   3956 static int
   3957 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
   3958              int flags, struct timespec *timeout)
   3959 {
   3960 	unsigned int vpos = 0;
   3961 	ssize_t rcvd;
   3962 
   3963 	/* timeout is ignored, ensure caller does not expect it to work */
   3964 	assert(timeout == NULL); (void)timeout;
   3965 
   3966 	while(vpos < vlen) {
   3967 		rcvd = recvfrom(sockfd,
   3968 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
   3969 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
   3970 		                flags,
   3971 		                msgvec[vpos].msg_hdr.msg_name,
   3972 		               &msgvec[vpos].msg_hdr.msg_namelen);
   3973 		if(rcvd < 0) {
   3974 			break;
   3975 		} else {
   3976 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
   3977 			msgvec[vpos].msg_len = (unsigned int)rcvd;
   3978 			vpos++;
   3979 		}
   3980 	}
   3981 
   3982 	if(vpos) {
   3983 		/* error will be picked up next time */
   3984 		return (int)vpos;
   3985 	} else if(errno == 0) {
   3986 		return 0;
   3987 	} else if(errno == EAGAIN) {
   3988 		return 0;
   3989 	}
   3990 
   3991 	return -1;
   3992 }
   3993 #endif /* HAVE_RECVMMSG */
   3994 
   3995 #ifdef HAVE_SENDMMSG
   3996 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
   3997 #else /* !HAVE_SENDMMSG */
   3998 
   3999 static int
   4000 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
   4001 {
   4002 	unsigned int vpos = 0;
   4003 	ssize_t snd;
   4004 
   4005 	while(vpos < vlen) {
   4006 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
   4007 		snd = sendto(sockfd,
   4008 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
   4009 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
   4010 		             flags,
   4011 		             msgvec[vpos].msg_hdr.msg_name,
   4012 		             msgvec[vpos].msg_hdr.msg_namelen);
   4013 		if(snd < 0) {
   4014 			break;
   4015 		} else {
   4016 			msgvec[vpos].msg_len = (unsigned int)snd;
   4017 			vpos++;
   4018 		}
   4019 	}
   4020 
   4021 	if(vpos) {
   4022 		return (int)vpos;
   4023 	} else if(errno == 0) {
   4024 		return 0;
   4025 	}
   4026 
   4027 	return -1;
   4028 }
   4029 #endif /* HAVE_SENDMMSG */
   4030 
/* Return nonzero if the port number in the socket address is zero,
 * and 0 for a nonzero port or an unrecognized address family. */
static int
port_is_zero(
#ifdef INET6
        struct sockaddr_storage *addr
#else
        struct sockaddr_in *addr
#endif
	)
{
#ifdef INET6
	switch(addr->ss_family) {
	case AF_INET6:
		return ((struct sockaddr_in6 *)addr)->sin6_port == 0;
	case AF_INET:
		return ((struct sockaddr_in *)addr)->sin_port == 0;
	default:
		return 0;
	}
#else
	return addr->sin_family == AF_INET && addr->sin_port == 0;
#endif
}
   4054 
/* Parses the PROXYv2 header from buf and updates the struct.
 * On success the client address in q is set to the proxied source
 * address (LOCAL or UNSPEC headers leave the addresses alone) and
 * q->is_proxied is set; for datagrams (stream == 0) the header is
 * stripped from the buffer afterwards.
 * Returns 1 on success, 0 on failure. */
static int
consume_pp2_header(struct buffer* buf, struct query* q, int stream)
{
	size_t size;
	struct pp2_header* header;
	int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf));
	if(err) {
		VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse "
			"PROXYv2 header: %s", pp_lookup_error(err)));
		return 0;
	}
	header = (struct pp2_header*)buffer_begin(buf);
	/* total size is the fixed header plus the payload length field */
	size = PP2_HEADER_SIZE + read_uint16(&header->len);
	if(size > buffer_limit(buf)) {
		VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer "
			"size to read PROXYv2 header"));
		return 0;
	}
	/* the command is carried in the low nibble of ver_cmd */
	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
		/* A connection from the proxy itself.
		 * No need to do anything with addresses. */
		goto done;
	}
	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
		/* Unspecified family and protocol. This could be used for
		 * health checks by proxies.
		 * No need to do anything with addresses. */
		goto done;
	}
	/* Read the proxied address */
	switch(header->fam_prot) {
		case PP2_INET_STREAM:
		case PP2_INET_DGRAM:
			{
			struct sockaddr_in* addr =
				(struct sockaddr_in*)&q->client_addr;
			addr->sin_family = AF_INET;
			memmove(&addr->sin_addr.s_addr,
				&header->addr.addr4.src_addr, 4);
			memmove(&addr->sin_port, &header->addr.addr4.src_port,
				2);
			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
			}
			/* Ignore the destination address; it should be us. */
			break;
#ifdef INET6
		case PP2_INET6_STREAM:
		case PP2_INET6_DGRAM:
			{
			struct sockaddr_in6* addr =
				(struct sockaddr_in6*)&q->client_addr;
			memset(addr, 0, sizeof(*addr));
			addr->sin6_family = AF_INET6;
			memmove(&addr->sin6_addr,
				header->addr.addr6.src_addr, 16);
			memmove(&addr->sin6_port, &header->addr.addr6.src_port,
				2);
			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
			}
			/* Ignore the destination address; it should be us. */
			break;
#endif /* INET6 */
		default:
			VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
				"family and protocol 0x%x",
				(int)header->fam_prot));
			return 0;
	}
	q->is_proxied = 1;
done:
	if(!stream) {
		/* We are reading a whole packet;
		 * Move the rest of the data to overwrite the PROXYv2 header */
		/* XXX can we do better to avoid memmove? */
		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
		buffer_set_limit(buf, buffer_limit(buf)-size);
	}
	return 1;
}
   4136 
/*
 * Event callback for a UDP socket: receive a batch of datagrams,
 * process each query, and transmit the answers in a batch. Dropped or
 * failed queries are swapped to the end of the arrays so the remaining
 * prefix can be sent in one batched send.
 */
static void
handle_udp(int fd, short event, void* arg)
{
	struct udp_handler_data *data = (struct udp_handler_data *) arg;
	int received, sent, recvcount, i;
	struct query *q;
	uint32_t now = 0;

	if (!(event & EV_READ)) {
		return;
	}
	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	/* this printf strangely gave a performance increase on Linux */
	/* printf("recvcount %d \n", recvcount); */
	if (recvcount == -1) {
		if (errno != EAGAIN && errno != EINTR) {
			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
		}
		/* Simply no data available */
		return;
	}
	for (i = 0; i < recvcount; i++) {
	loopstart:
		received = msgs[i].msg_len;
		queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen;
		queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr);
		queries[i]->is_proxied = 0;
		q = queries[i];
		if (received == -1) {
			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
#if defined(HAVE_RECVMMSG)
				msgs[i].msg_hdr.msg_flags
#else
				errno
#endif
				));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
			goto swap_drop;
		}

		/* Account... */
#ifdef BIND8_STATS
		if (data->socket->addr.ai_family == AF_INET) {
			STATUP(data->nsd, qudp);
		} else if (data->socket->addr.ai_family == AF_INET6) {
			STATUP(data->nsd, qudp6);
		}
#endif

		buffer_skip(q->packet, received);
		buffer_flip(q->packet);
		if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) {
			VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not "
				"consume PROXYv2 header"));
			goto swap_drop;
		}
		if(!q->is_proxied) {
			/* no proxy header: the peer address is the client */
			q->client_addrlen = q->remote_addrlen;
			memmove(&q->client_addr, &q->remote_addr,
				q->remote_addrlen);
		}
#ifdef USE_DNSTAP
		/*
		 * sending UDP-query with server address (local) and client address to dnstap process
		 */
		log_addr("query from client", &q->client_addr);
		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
		if(verbosity >= 6 && q->is_proxied)
			log_addr("query via proxy", &q->remote_addr);
		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen,
			q->tcp, q->packet);
#endif /* USE_DNSTAP */

		/* Process and answer the query... */
		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
				STATUP(data->nsd, nona);
				ZTATUP(data->nsd, q->zone, nona);
			}

#ifdef USE_ZONE_STATS
			if (data->socket->addr.ai_family == AF_INET) {
				ZTATUP(data->nsd, q->zone, qudp);
			} else if (data->socket->addr.ai_family == AF_INET6) {
				ZTATUP(data->nsd, q->zone, qudp6);
			}
#endif

			/* Add EDNS0 and TSIG info if necessary.  */
			query_add_optional(q, data->nsd, &now);

			buffer_flip(q->packet);
			iovecs[i].iov_len = buffer_remaining(q->packet);
#ifdef BIND8_STATS
			/* Account the rcode & TC... */
			STATUP2(data->nsd, rcode, RCODE(q->packet));
			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
			if (TC(q->packet)) {
				STATUP(data->nsd, truncated);
				ZTATUP(data->nsd, q->zone, truncated);
			}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
			/*
			 * sending UDP-response with server address (local) and client address to dnstap process
			 */
			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
			log_addr("response to client", &q->client_addr);
			if(verbosity >= 6 && q->is_proxied)
				log_addr("response via proxy", &q->remote_addr);
			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
				&q->client_addr, q->client_addrlen, q->tcp, q->packet,
				q->zone);
#endif /* USE_DNSTAP */
		} else {
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
		swap_drop:
			STATUP(data->nsd, dropped);
			ZTATUP(data->nsd, q->zone, dropped);
			if(i != recvcount-1) {
				/* swap with last and decrease recvcount */
				struct mmsghdr mtmp = msgs[i];
				struct iovec iotmp = iovecs[i];
				recvcount--;
				msgs[i] = msgs[recvcount];
				iovecs[i] = iovecs[recvcount];
				queries[i] = queries[recvcount];
				msgs[recvcount] = mtmp;
				iovecs[recvcount] = iotmp;
				queries[recvcount] = q;
				msgs[i].msg_hdr.msg_iov = &iovecs[i];
				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
				/* reprocess the message swapped into slot i */
				goto loopstart;
			} else { recvcount --; }
		}
	}

	/* send until all are sent */
	i = 0;
	while(i<recvcount) {
		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
		if(sent == -1) {
			if(errno == ENOBUFS ||
#ifdef EWOULDBLOCK
				errno == EWOULDBLOCK ||
#endif
				errno == EAGAIN) {
				/* block to wait until send buffer avail */
				int flag, errstore;
				if((flag = fcntl(fd, F_GETFL)) == -1) {
					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
					flag = 0;
				}
				flag &= ~O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
				errstore = errno;
				flag |= O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
				if(sent != -1) {
					i += sent;
					continue;
				}
				errno = errstore;
			}
			if(errno == EINVAL) {
				/* skip the invalid argument entry,
				 * send the remaining packets in the list */
				if(!(port_is_zero((void*)&queries[i]->remote_addr) &&
					verbosity < 3)) {
					const char* es = strerror(errno);
					char a[64];
					addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
				}
				i += 1;
				continue;
			}
			/* don't log transient network full errors, unless
			 * on higher verbosity */
			if(!(errno == ENOBUFS && verbosity < 1) &&
#ifdef EWOULDBLOCK
			   errno != EWOULDBLOCK &&
#endif
			   errno != EAGAIN) {
				const char* es = strerror(errno);
				char a[64];
				addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
			}
#ifdef BIND8_STATS
			data->nsd->st->txerr += recvcount-i;
#endif /* BIND8_STATS */
			break;
		}
		i += sent;
	}
	/* reset the queries and iovecs for the next receive batch */
	for(i=0; i<recvcount; i++) {
		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
	}
}
   4350 
   4351 #ifdef HAVE_SSL
   4352 /*
   4353  * Setup an event for the tcp handler.
   4354  */
   4355 static void
   4356 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
   4357        int fd, short event)
   4358 {
   4359 	struct timeval timeout;
   4360 	struct event_base* ev_base;
   4361 
   4362 	timeout.tv_sec = data->nsd->tcp_timeout;
   4363 	timeout.tv_usec = 0L;
   4364 
   4365 	ev_base = data->event.ev_base;
   4366 	event_del(&data->event);
   4367 	memset(&data->event, 0, sizeof(data->event));
   4368 	event_set(&data->event, fd, event, fn, data);
   4369 	if(event_base_set(ev_base, &data->event) != 0)
   4370 		log_msg(LOG_ERR, "event base set failed");
   4371 	if(event_add(&data->event, &timeout) != 0)
   4372 		log_msg(LOG_ERR, "event add failed");
   4373 }
   4374 #endif /* HAVE_SSL */
   4375 
/*
 * Tear down a tcp connection handler: remove its event, shut down any
 * TLS state, close the socket, unlink it from the active-connection
 * list, and release its memory region.  Also re-enables the TCP accept
 * handlers when a connection slot becomes available again.
 */
static void
cleanup_tcp_handler(struct tcp_handler_data* data)
{
	event_del(&data->event);
#ifdef HAVE_SSL
	/* shut down TLS before the underlying fd is closed below */
	if(data->tls) {
		SSL_shutdown(data->tls);
		SSL_free(data->tls);
		data->tls = NULL;
	}
	if(data->tls_auth) {
		SSL_shutdown(data->tls_auth);
		SSL_free(data->tls_auth);
		data->tls_auth = NULL;
	}
#endif
	data->pp2_header_state = pp2_header_none;
	close(data->event.ev_fd);
	/* unlink from the doubly-linked list of active tcp handlers */
	if(data->prev)
		data->prev->next = data->next;
	else	tcp_active_list = data->next;
	if(data->next)
		data->next->prev = data->prev;

	/*
	 * Enable the TCP accept handlers when the current number of
	 * TCP connections is about to drop below the maximum number
	 * of TCP connections.
	 */
	/* note: current_tcp_count is compared before it is decremented,
	 * so this fires exactly when we were at the limit */
	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
		configure_handler_event_types(EV_READ|EV_PERSIST);
		if(slowaccept) {
			event_del(&slowaccept_event);
			slowaccept = 0;
		}
	}
	--data->nsd->current_tcp_count;
	assert(data->nsd->current_tcp_count >= 0);

	region_destroy(data->region);
}
   4417 
   4418 /* Read more data into the buffer for tcp read. Pass the amount of additional
   4419  * data required. Returns false if nothing needs to be done this event, or
   4420  * true if the additional data is in the buffer. */
   4421 static int
   4422 more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos,
   4423 	size_t add_amount, ssize_t* received)
   4424 {
   4425 	*received = read(fd, bufpos, add_amount);
   4426 	if (*received == -1) {
   4427 		if (errno == EAGAIN || errno == EINTR) {
   4428 			/*
   4429 			 * Read would block, wait until more
   4430 			 * data is available.
   4431 			 */
   4432 			return 0;
   4433 		} else {
   4434 			char buf[48];
   4435 			if(data->query) {
   4436 				addr2str(&data->query->remote_addr, buf, sizeof(buf));
   4437 			} else {
   4438 				snprintf(buf, sizeof(buf), "unknown");
   4439 			}
   4440 #ifdef ECONNRESET
   4441 			if (verbosity >= 2 || errno != ECONNRESET)
   4442 #endif /* ECONNRESET */
   4443 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
   4444 			cleanup_tcp_handler(data);
   4445 			return 0;
   4446 		}
   4447 	} else if (*received == 0) {
   4448 		/* EOF */
   4449 		cleanup_tcp_handler(data);
   4450 		return 0;
   4451 	}
   4452 	return 1;
   4453 }
   4454 
/*
 * Handle a readable event on a plain TCP connection.  Reads, in order:
 * the optional PROXYv2 header, the two-octet DNS message length prefix,
 * and then the DNS query itself.  Each stage may return early and wait
 * for another read event; data->bytes_transmitted tracks progress.
 * Once a complete query is in the buffer it is processed and the
 * connection's event is switched to handle_tcp_writing for the answer.
 */
static void
handle_tcp_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	struct event_base* ev_base;
	struct timeval timeout;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out.  */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
	     data->query_count >= data->nsd->tcp_query_count) ||
	    (data->query_count > 0 && data->tcp_no_more_queries))
  {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	/* A new query starts on this connection; reset the query
	 * structure before reading fresh data into it. */
	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
		data->query_needs_reset = 0;
	}

	/* Consume the PROXYv2 header before any DNS data: first the
	 * fixed part (PP2_HEADER_SIZE octets), then the variable part
	 * whose length is carried inside the fixed part. */
	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
		struct pp2_header* header = NULL;
		size_t want_read_size = 0;
		size_t current_read_size = 0;
		if(data->pp2_header_state == pp2_header_none) {
			want_read_size = PP2_HEADER_SIZE;
			if(buffer_remaining(data->query->packet) <
				want_read_size) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
				cleanup_tcp_handler(data);
				return;
			}
			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
			current_read_size = want_read_size;
			if(data->bytes_transmitted < current_read_size) {
				if(!more_read_buf_tcp(fd, data,
					(void*)buffer_at(data->query->packet,
						data->bytes_transmitted),
					current_read_size - data->bytes_transmitted,
					&received))
					return;
				data->bytes_transmitted += received;
				buffer_skip(data->query->packet, received);
				/* partial read of the fixed part; wait for more */
				if(data->bytes_transmitted != current_read_size)
					return;
				data->pp2_header_state = pp2_header_init;
			}
		}
		if(data->pp2_header_state == pp2_header_init) {
			int err;
			err = pp2_read_header(buffer_begin(data->query->packet),
				buffer_limit(data->query->packet));
			if(err) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
				cleanup_tcp_handler(data);
				return;
			}
			header = (struct pp2_header*)buffer_begin(data->query->packet);
			/* length of the variable part, from the fixed header */
			want_read_size = ntohs(header->len);
			if(buffer_limit(data->query->packet) <
				PP2_HEADER_SIZE + want_read_size) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
				cleanup_tcp_handler(data);
				return;
			}
			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
			current_read_size = PP2_HEADER_SIZE + want_read_size;
			if(want_read_size == 0) {
				/* nothing more to read; header is complete */
				data->pp2_header_state = pp2_header_done;
			} else if(data->bytes_transmitted < current_read_size) {
				if(!more_read_buf_tcp(fd, data,
					(void*)buffer_at(data->query->packet,
						data->bytes_transmitted),
					current_read_size - data->bytes_transmitted,
					&received))
					return;
				data->bytes_transmitted += received;
				buffer_skip(data->query->packet, received);
				/* partial read of the variable part; wait */
				if(data->bytes_transmitted != current_read_size)
					return;
				data->pp2_header_state = pp2_header_done;
			}
		}
		if(data->pp2_header_state != pp2_header_done || !header) {
			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));

			cleanup_tcp_handler(data);
			return;
		}
		buffer_flip(data->query->packet);
		/* applies the proxied client address to data->query */
		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));

			cleanup_tcp_handler(data);
			return;
		}
		/* Clear and reset the buffer to read the following
		 * DNS packet(s). */
		buffer_clear(data->query->packet);
		data->bytes_transmitted = 0;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if (data->bytes_transmitted < sizeof(uint16_t)) {
		if(!more_read_buf_tcp(fd, data,
			(char*) &data->query->tcplen + data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted, &received))
			return;
		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}
		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name   (1)
		 *   + Query class        (2)
		 *   + Query type         (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		/* only read up to the announced query length */
		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data.  */
	if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet),
		buffer_remaining(data->query->packet), &received))
		return;
	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifdef BIND8_STATS
#ifndef INET6
	STATUP(data->nsd, ctcp);
#else
	if (data->query->remote_addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctcp);
	} else if (data->query->remote_addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctcp6);
	}
#endif
#endif /* BIND8_STATS */

	/* We have a complete query, process it.  */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * and send TCP-query with found address (local) and client address to dnstap process
	 */
	log_addr("query from client", &data->query->client_addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	if(verbosity >= 6 && data->query->is_proxied)
		log_addr("query via proxy", &data->query->remote_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
		data->query->client_addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	/* count non-authoritative answers with rcode NOERROR */
	if (RCODE(data->query->packet) == RCODE_OK
	    && !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctcp);
#else
	if (data->query->remote_addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctcp);
	} else if (data->query->remote_addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctcp6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler.  */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->client_addr);
	if(verbosity >= 6 && data->query->is_proxied)
		log_addr("response via proxy", &data->query->remote_addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
		data->query->client_addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
		handle_tcp_writing, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpr failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
	handle_tcp_writing(fd, EV_WRITE, data);
}
   4733 
/*
 * Handle a writable event on a TCP connection.  Writes the two-octet
 * length prefix and the DNS answer (in one writev where available),
 * handling partial writes via data->bytes_transmitted.  Continues
 * AXFR/IXFR transfers with further packets, and finally switches the
 * connection back to handle_tcp_reading for the next query.
 */
static void
handle_tcp_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	struct timeval timeout;
	struct event_base* ev_base;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT) || !q) {
		/* Connection timed out.  */
		/* Or data->query is NULL, in which case nothing to do. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if (data->bytes_transmitted < sizeof(q->tcplen)) {
		/* Writing the response packet length.  */
		uint16_t n_tcplen = htons(q->tcplen);
#ifdef HAVE_WRITEV
		/* write the remaining length prefix and the packet in
		 * a single writev call */
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
		iov[1].iov_base = buffer_begin(q->packet);
		iov[1].iov_len = buffer_limit(q->packet);
		sent = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		sent = write(fd,
			     (const char *) &n_tcplen + data->bytes_transmitted,
			     sizeof(n_tcplen) - data->bytes_transmitted);
#endif /* HAVE_WRITEV */
		if (sent == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Write would block, wait until
				 * socket becomes writable again.
				 */
				return;
			} else {
				/* resets/broken pipes are common; only
				 * log them on verbosity >= 2 */
#ifdef ECONNRESET
				if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				  if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
				{
					char client_ip[128];
					if(data->query) {
						addr2str(&data->query->client_addr, client_ip, sizeof(client_ip));
					} else {
						snprintf(client_ip, sizeof(client_ip), "unknown");
					}
					log_msg(LOG_ERR, "failed writing to tcp from %s: %s", client_ip, strerror(errno));
				}
				cleanup_tcp_handler(data);
				return;
			}
		}

		data->bytes_transmitted += sent;
		if (data->bytes_transmitted < sizeof(q->tcplen)) {
			/*
			 * Writing not complete, wait until socket
			 * becomes writable again.
			 */
			return;
		}

#ifdef HAVE_WRITEV
		/* sent covered the prefix plus some packet bytes;
		 * keep only the packet byte count */
		sent -= sizeof(n_tcplen);
		/* handle potential 'packet done' code */
		goto packet_could_be_done;
#endif
	}

	sent = write(fd,
		     buffer_current(q->packet),
		     buffer_remaining(q->packet));
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Write would block, wait until
			 * socket becomes writable again.
			 */
			return;
		} else {
#ifdef ECONNRESET
			if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				  if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
		{
			char client_ip[128];
			if(data->query) {
				addr2str(&data->query->client_addr, client_ip, sizeof(client_ip));
			} else {
				snprintf(client_ip, sizeof(client_ip), "unknown");
			}
			log_msg(LOG_ERR, "failed writing to tcp from %s: %s", client_ip, strerror(errno));
		}
			cleanup_tcp_handler(data);
			return;
		}
	}

	data->bytes_transmitted += sent;
#ifdef HAVE_WRITEV
  packet_could_be_done:
#endif
	buffer_skip(q->packet, sent);
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR ||
		data->query_state == QUERY_IN_IXFR) {
		/* Continue processing AXFR and writing back results.  */
		buffer_clear(q->packet);
		if(data->query_state == QUERY_IN_AXFR)
			data->query_state = query_axfr(data->nsd, q, 0);
		else data->query_state = query_ixfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset timeout.  */
			timeout.tv_sec = data->tcp_timeout / 1000;
			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
			ev_base = data->event.ev_base;
			event_del(&data->event);
			memset(&data->event, 0, sizeof(data->event));
			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
				handle_tcp_writing, data);
			if(event_base_set(ev_base, &data->event) != 0)
				log_msg(LOG_ERR, "event base set tcpw failed");
			if(event_add(&data->event, &timeout) != 0)
				log_msg(LOG_ERR, "event add tcpw failed");

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {
		/* half-close: no more answers will be written */
		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;
	data->query_needs_reset = 1;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}
   4919 
   4920 #ifdef HAVE_SSL
   4921 /** create SSL object and associate fd */
   4922 static SSL*
   4923 incoming_ssl_fd(SSL_CTX* ctx, int fd)
   4924 {
   4925 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
   4926 	if(!ssl) {
   4927 		log_crypto_err("could not SSL_new");
   4928 		return NULL;
   4929 	}
   4930 	SSL_set_accept_state(ssl);
   4931 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
   4932 	if(!SSL_set_fd(ssl, fd)) {
   4933 		log_crypto_err("could not SSL_set_fd");
   4934 		SSL_free(ssl);
   4935 		return NULL;
   4936 	}
   4937 	return ssl;
   4938 }
   4939 
/** TLS handshake to upgrade TCP connection.
 * Drives SSL_do_handshake until complete, switching the libevent
 * read/write registration whenever OpenSSL reports WANT_READ or
 * WANT_WRITE.  'writing' indicates which handler the caller wants
 * installed once the handshake finishes.  Returns 0 if the connection
 * was cleaned up, 1 otherwise (caller should check shake_state). */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) setup the TLS connection */
	/* tls_auth is the authenticated (e.g. XoT) channel; it takes
	 * precedence over the plain tls object when present */
	ERR_clear_error();
	if(data->tls_auth)
		r = SSL_do_handshake(data->tls_auth);
	else
		r = SSL_do_handshake(data->tls);

	if(r != 1) {
		/* handshake did not complete; find out why */
		int want;
		if(data->tls_auth)
			want = SSL_get_error(data->tls_auth, r);
		else
			want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			/* hard failure; log (unless squelched) and drop */
			if(r == 0)
				VERBOSITY(5, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					if(data->query) {
						addr2str(&data->query->remote_addr, a, sizeof(a));
					} else {
						snprintf(a, sizeof(a), "unknown");
					}
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(LOG_ERR, s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Use to log successful upgrade for testing - could be removed*/
	if(data->tls_auth)
		VERBOSITY(5, (LOG_INFO, "TLS-AUTH handshake succeeded."));
	else
		VERBOSITY(5, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}
   5024 
   5025 /* Read more data into the buffer for tls read. Pass the amount of additional
   5026  * data required. Returns false if nothing needs to be done this event, or
   5027  * true if the additional data is in the buffer. */
   5028 static int
   5029 more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos,
   5030 	size_t add_amount, ssize_t* received)
   5031 {
   5032 	int r;
   5033 	ERR_clear_error();
   5034 	if(data->tls_auth)
   5035 		r = (*received=SSL_read(data->tls_auth, bufpos, add_amount));
   5036 	else
   5037 		r = (*received=SSL_read(data->tls, bufpos, add_amount));
   5038 	if(r <= 0) {
   5039 		int want;
   5040 		if(data->tls_auth)
   5041 			want = SSL_get_error(data->tls_auth, *received);
   5042 		else
   5043 			want = SSL_get_error(data->tls, *received);
   5044 		if(want == SSL_ERROR_ZERO_RETURN) {
   5045 			cleanup_tcp_handler(data);
   5046 			return 0; /* shutdown, closed */
   5047 		} else if(want == SSL_ERROR_WANT_READ) {
   5048 			/* wants to be called again */
   5049 			return 0;
   5050 		}
   5051 		else if(want == SSL_ERROR_WANT_WRITE) {
   5052 			/* switch to writing */
   5053 			data->shake_state = tls_hs_write_event;
   5054 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
   5055 			return 0;
   5056 		}
   5057 		cleanup_tcp_handler(data);
   5058 		log_crypto_err("could not SSL_read");
   5059 		return 0;
   5060 	}
   5061 	return 1;
   5062 }
   5063 
   5064 /** handle TLS reading of incoming query */
   5065 static void
   5066 handle_tls_reading(int fd, short event, void* arg)
   5067 {
   5068 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
   5069 	ssize_t received;
   5070 	uint32_t now = 0;
   5071 
   5072 	if ((event & EV_TIMEOUT)) {
   5073 		/* Connection timed out.  */
   5074 		cleanup_tcp_handler(data);
   5075 		return;
   5076 	}
   5077 
   5078 	if ((data->nsd->tcp_query_count > 0 &&
   5079 	     data->query_count >= data->nsd->tcp_query_count) ||
   5080 	    (data->query_count > 0 && data->tcp_no_more_queries))
   5081 	{
   5082 		/* No more queries allowed on this tcp connection. */
   5083 		cleanup_tcp_handler(data);
   5084 		return;
   5085 	}
   5086 
   5087 	assert((event & EV_READ));
   5088 
   5089 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
   5090 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
   5091 		data->query_needs_reset = 0;
   5092 	}
   5093 
   5094 	if(data->shake_state != tls_hs_none) {
   5095 		if(!tls_handshake(data, fd, 0))
   5096 			return;
   5097 		if(data->shake_state != tls_hs_none)
   5098 			return;
   5099 	}
   5100 
   5101 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
   5102 		struct pp2_header* header = NULL;
   5103 		size_t want_read_size = 0;
   5104 		size_t current_read_size = 0;
   5105 		if(data->pp2_header_state == pp2_header_none) {
   5106 			want_read_size = PP2_HEADER_SIZE;
   5107 			if(buffer_remaining(data->query->packet) <
   5108 				want_read_size) {
   5109 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
   5110 				cleanup_tcp_handler(data);
   5111 				return;
   5112 			}
   5113 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
   5114 			current_read_size = want_read_size;
   5115 			if(data->bytes_transmitted < current_read_size) {
   5116 				if(!more_read_buf_tls(fd, data,
   5117 					buffer_at(data->query->packet,
   5118 						data->bytes_transmitted),
   5119 					current_read_size - data->bytes_transmitted,
   5120 					&received))
   5121 					return;
   5122 				data->bytes_transmitted += received;
   5123 				buffer_skip(data->query->packet, received);
   5124 				if(data->bytes_transmitted != current_read_size)
   5125 					return;
   5126 				data->pp2_header_state = pp2_header_init;
   5127 			}
   5128 		}
   5129 		if(data->pp2_header_state == pp2_header_init) {
   5130 			int err;
   5131 			err = pp2_read_header(buffer_begin(data->query->packet),
   5132 				buffer_limit(data->query->packet));
   5133 			if(err) {
   5134 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
   5135 				cleanup_tcp_handler(data);
   5136 				return;
   5137 			}
   5138 			header = (struct pp2_header*)buffer_begin(data->query->packet);
   5139 			want_read_size = ntohs(header->len);
   5140 			if(buffer_limit(data->query->packet) <
   5141 				PP2_HEADER_SIZE + want_read_size) {
   5142 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
   5143 				cleanup_tcp_handler(data);
   5144 				return;
   5145 			}
   5146 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
   5147 			current_read_size = PP2_HEADER_SIZE + want_read_size;
   5148 			if(want_read_size == 0) {
   5149 				/* nothing more to read; header is complete */
   5150 				data->pp2_header_state = pp2_header_done;
   5151 			} else if(data->bytes_transmitted < current_read_size) {
   5152 				if(!more_read_buf_tls(fd, data,
   5153 					buffer_at(data->query->packet,
   5154 						data->bytes_transmitted),
   5155 					current_read_size - data->bytes_transmitted,
   5156 					&received))
   5157 					return;
   5158 				data->bytes_transmitted += received;
   5159 				buffer_skip(data->query->packet, received);
   5160 				if(data->bytes_transmitted != current_read_size)
   5161 					return;
   5162 				data->pp2_header_state = pp2_header_done;
   5163 			}
   5164 		}
   5165 		if(data->pp2_header_state != pp2_header_done || !header) {
   5166 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
   5167 			cleanup_tcp_handler(data);
   5168 			return;
   5169 		}
   5170 		buffer_flip(data->query->packet);
   5171 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
   5172 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
   5173 			cleanup_tcp_handler(data);
   5174 			return;
   5175 		}
   5176 		/* Clear and reset the buffer to read the following
   5177 		 * DNS packet(s). */
   5178 		buffer_clear(data->query->packet);
   5179 		data->bytes_transmitted = 0;
   5180 	}
   5181 	/*
   5182 	 * Check if we received the leading packet length bytes yet.
   5183 	 */
   5184 	if(data->bytes_transmitted < sizeof(uint16_t)) {
   5185 		if(!more_read_buf_tls(fd, data,
   5186 		    (char *) &data->query->tcplen + data->bytes_transmitted,
   5187 		    sizeof(uint16_t) - data->bytes_transmitted, &received))
   5188 			return;
   5189 		data->bytes_transmitted += received;
   5190 		if (data->bytes_transmitted < sizeof(uint16_t)) {
   5191 			/*
   5192 			 * Not done with the tcplen yet, wait for more
   5193 			 * data to become available.
   5194 			 */
   5195 			return;
   5196 		}
   5197 
   5198 		assert(data->bytes_transmitted == sizeof(uint16_t));
   5199 
   5200 		data->query->tcplen = ntohs(data->query->tcplen);
   5201 
   5202 		/*
   5203 		 * Minimum query size is:
   5204 		 *
   5205 		 *     Size of the header (12)
   5206 		 *   + Root domain name   (1)
   5207 		 *   + Query class        (2)
   5208 		 *   + Query type         (2)
   5209 		 */
   5210 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
   5211 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
   5212 			cleanup_tcp_handler(data);
   5213 			return;
   5214 		}
   5215 
   5216 		if (data->query->tcplen > data->query->maxlen) {
   5217 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
   5218 			cleanup_tcp_handler(data);
   5219 			return;
   5220 		}
   5221 
   5222 		buffer_set_limit(data->query->packet, data->query->tcplen);
   5223 	}
   5224 
   5225 	assert(buffer_remaining(data->query->packet) > 0);
   5226 
   5227 	/* Read the (remaining) query data.  */
   5228 	if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet),
   5229 		buffer_remaining(data->query->packet), &received))
   5230 		return;
   5231 	data->bytes_transmitted += received;
   5232 	buffer_skip(data->query->packet, received);
   5233 	if (buffer_remaining(data->query->packet) > 0) {
   5234 		/*
   5235 		 * Message not yet complete, wait for more data to
   5236 		 * become available.
   5237 		 */
   5238 		return;
   5239 	}
   5240 
   5241 	assert(buffer_position(data->query->packet) == data->query->tcplen);
   5242 
   5243 	/* Account... */
   5244 #ifndef INET6
   5245 	STATUP(data->nsd, ctls);
   5246 #else
   5247 	if (data->query->remote_addr.ss_family == AF_INET) {
   5248 		STATUP(data->nsd, ctls);
   5249 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
   5250 		STATUP(data->nsd, ctls6);
   5251 	}
   5252 #endif
   5253 
   5254 	/* We have a complete query, process it.  */
   5255 
   5256 	/* tcp-query-count: handle query counter ++ */
   5257 	data->query_count++;
   5258 
   5259 	buffer_flip(data->query->packet);
   5260 #ifdef USE_DNSTAP
   5261 	/*
   5262 	 * and send TCP-query with found address (local) and client address to dnstap process
   5263 	 */
   5264 	log_addr("query from client", &data->query->client_addr);
   5265 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
   5266 	if(verbosity >= 6 && data->query->is_proxied)
   5267 		log_addr("query via proxy", &data->query->remote_addr);
   5268 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
   5269 		data->query->client_addrlen, data->query->tcp, data->query->packet);
   5270 #endif /* USE_DNSTAP */
   5271 	data->query_state = server_process_query(data->nsd, data->query, &now);
   5272 	if (data->query_state == QUERY_DISCARDED) {
   5273 		/* Drop the packet and the entire connection... */
   5274 		STATUP(data->nsd, dropped);
   5275 		ZTATUP(data->nsd, data->query->zone, dropped);
   5276 		cleanup_tcp_handler(data);
   5277 		return;
   5278 	}
   5279 
   5280 #ifdef BIND8_STATS
   5281 	if (RCODE(data->query->packet) == RCODE_OK
   5282 	    && !AA(data->query->packet))
   5283 	{
   5284 		STATUP(data->nsd, nona);
   5285 		ZTATUP(data->nsd, data->query->zone, nona);
   5286 	}
   5287 #endif /* BIND8_STATS */
   5288 
   5289 #ifdef USE_ZONE_STATS
   5290 #ifndef INET6
   5291 	ZTATUP(data->nsd, data->query->zone, ctls);
   5292 #else
   5293 	if (data->query->remote_addr.ss_family == AF_INET) {
   5294 		ZTATUP(data->nsd, data->query->zone, ctls);
   5295 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
   5296 		ZTATUP(data->nsd, data->query->zone, ctls6);
   5297 	}
   5298 #endif
   5299 #endif /* USE_ZONE_STATS */
   5300 
   5301 	query_add_optional(data->query, data->nsd, &now);
   5302 
   5303 	/* Switch to the tcp write handler.  */
   5304 	buffer_flip(data->query->packet);
   5305 	data->query->tcplen = buffer_remaining(data->query->packet);
   5306 #ifdef BIND8_STATS
   5307 	/* Account the rcode & TC... */
   5308 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
   5309 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
   5310 	if (TC(data->query->packet)) {
   5311 		STATUP(data->nsd, truncated);
   5312 		ZTATUP(data->nsd, data->query->zone, truncated);
   5313 	}
   5314 #endif /* BIND8_STATS */
   5315 #ifdef USE_DNSTAP
   5316 	/*
   5317 	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
   5318 	 */
   5319 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
   5320 	log_addr("response to client", &data->query->client_addr);
   5321 	if(verbosity >= 6 && data->query->is_proxied)
   5322 		log_addr("response via proxy", &data->query->remote_addr);
   5323 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
   5324 		data->query->client_addrlen, data->query->tcp, data->query->packet,
   5325 		data->query->zone);
   5326 #endif /* USE_DNSTAP */
   5327 	data->bytes_transmitted = 0;
   5328 
   5329 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
   5330 
   5331 	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
   5332 	handle_tls_writing(fd, EV_WRITE, data);
   5333 }
   5334 
   5335 /** handle TLS writing of outgoing response */
   5336 static void
   5337 handle_tls_writing(int fd, short event, void* arg)
   5338 {
   5339 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
   5340 	ssize_t sent;
   5341 	struct query *q = data->query;
   5342 	/* static variable that holds reassembly buffer used to put the
   5343 	 * TCP length in front of the packet, like writev. */
   5344 	static buffer_type* global_tls_temp_buffer = NULL;
   5345 	buffer_type* write_buffer;
   5346 	uint32_t now = 0;
   5347 
   5348 	if ((event & EV_TIMEOUT) || !q) {
   5349 		/* Connection timed out.  */
   5350 		/* Or data->query is NULL, in which case nothing to do. */
   5351 		cleanup_tcp_handler(data);
   5352 		return;
   5353 	}
   5354 
   5355 	assert((event & EV_WRITE));
   5356 
   5357 	if(data->shake_state != tls_hs_none) {
   5358 		if(!tls_handshake(data, fd, 1))
   5359 			return;
   5360 		if(data->shake_state != tls_hs_none)
   5361 			return;
   5362 	}
   5363 
   5364 	if(data->tls_auth)
   5365 		(void)SSL_set_mode(data->tls_auth, SSL_MODE_ENABLE_PARTIAL_WRITE);
   5366 	else
   5367 		(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
   5368 
   5369 	/* If we are writing the start of a message, we must include the length
   5370 	 * this is done with a copy into write_buffer. */
   5371 	write_buffer = NULL;
   5372 	if (data->bytes_transmitted == 0) {
   5373 		if(!global_tls_temp_buffer) {
   5374 			/* gets deallocated when nsd shuts down from
   5375 			 * nsd.region */
   5376 			global_tls_temp_buffer = buffer_create(nsd.region,
   5377 				QIOBUFSZ + sizeof(q->tcplen));
   5378 			if (!global_tls_temp_buffer) {
   5379 				return;
   5380 			}
   5381 		}
   5382 		write_buffer = global_tls_temp_buffer;
   5383 		buffer_clear(write_buffer);
   5384 		buffer_write_u16(write_buffer, q->tcplen);
   5385 		buffer_write(write_buffer, buffer_current(q->packet),
   5386 			(int)buffer_remaining(q->packet));
   5387 		buffer_flip(write_buffer);
   5388 	} else {
   5389 		write_buffer = q->packet;
   5390 	}
   5391 
   5392 	/* Write the response */
   5393 	ERR_clear_error();
   5394 	if(data->tls_auth)
   5395 		sent = SSL_write(data->tls_auth, buffer_current(write_buffer), buffer_remaining(write_buffer));
   5396 	else
   5397 		sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
   5398 	if(sent <= 0) {
   5399 		int want;
   5400 		if(data->tls_auth)
   5401 			want = SSL_get_error(data->tls_auth, sent);
   5402 		else
   5403 			want = SSL_get_error(data->tls, sent);
   5404 		if(want == SSL_ERROR_ZERO_RETURN) {
   5405 			cleanup_tcp_handler(data);
   5406 			/* closed */
   5407 		} else if(want == SSL_ERROR_WANT_READ) {
   5408 			/* switch back to reading */
   5409 			data->shake_state = tls_hs_read_event;
   5410 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
   5411 		} else if(want != SSL_ERROR_WANT_WRITE) {
   5412 			cleanup_tcp_handler(data);
   5413 			{
   5414 				char client_ip[128], e[188];
   5415 				if(data->query) {
   5416 					addr2str(&data->query->client_addr, client_ip, sizeof(client_ip));
   5417 				} else {
   5418 					snprintf(client_ip, sizeof(client_ip), "unknown");
   5419 				}
   5420 				snprintf(e, sizeof(e), "failed writing to tls from %s: %s",
   5421 					client_ip, "SSL_write error");
   5422 				log_crypto_err(e);
   5423 			}
   5424 		}
   5425 		return;
   5426 	}
   5427 
   5428 	buffer_skip(write_buffer, sent);
   5429 	if(buffer_remaining(write_buffer) != 0) {
   5430 		/* If not all sent, sync up the real buffer if it wasn't used.*/
   5431 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
   5432 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
   5433 		}
   5434 	}
   5435 
   5436 	data->bytes_transmitted += sent;
   5437 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
   5438 		/*
   5439 		 * Still more data to write when socket becomes
   5440 		 * writable again.
   5441 		 */
   5442 		return;
   5443 	}
   5444 
   5445 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
   5446 
   5447 	if (data->query_state == QUERY_IN_AXFR ||
   5448 		data->query_state == QUERY_IN_IXFR) {
   5449 		/* Continue processing AXFR and writing back results.  */
   5450 		buffer_clear(q->packet);
   5451 		if(data->query_state == QUERY_IN_AXFR)
   5452 			data->query_state = query_axfr(data->nsd, q, 0);
   5453 		else data->query_state = query_ixfr(data->nsd, q);
   5454 		if (data->query_state != QUERY_PROCESSED) {
   5455 			query_add_optional(data->query, data->nsd, &now);
   5456 
   5457 			/* Reset data. */
   5458 			buffer_flip(q->packet);
   5459 			q->tcplen = buffer_remaining(q->packet);
   5460 			data->bytes_transmitted = 0;
   5461 			/* Reset to writing mode.  */
   5462 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
   5463 
   5464 			/*
   5465 			 * Write data if/when the socket is writable
   5466 			 * again.
   5467 			 */
   5468 			return;
   5469 		}
   5470 	}
   5471 
   5472 	/*
   5473 	 * Done sending, wait for the next request to arrive on the
   5474 	 * TCP socket by installing the TCP read handler.
   5475 	 */
   5476 	if ((data->nsd->tcp_query_count > 0 &&
   5477 		data->query_count >= data->nsd->tcp_query_count) ||
   5478 		data->tcp_no_more_queries) {
   5479 
   5480 		(void) shutdown(fd, SHUT_WR);
   5481 	}
   5482 
   5483 	data->bytes_transmitted = 0;
   5484 	data->query_needs_reset = 1;
   5485 
   5486 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
   5487 }
   5488 #endif
   5489 
   5490 static void
   5491 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
   5492 	void* ATTR_UNUSED(arg))
   5493 {
   5494 	if(slowaccept) {
   5495 		configure_handler_event_types(EV_PERSIST | EV_READ);
   5496 		slowaccept = 0;
   5497 	}
   5498 }
   5499 
   5500 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
   5501 {
   5502 #ifndef HAVE_ACCEPT4
   5503 	int s = accept(fd, addr, addrlen);
   5504 	if (s != -1) {
   5505 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
   5506 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
   5507 			close(s);
   5508 			s = -1;
   5509 			errno=EINTR; /* stop error printout as error in accept4
   5510 				by setting this errno, it omits printout, in
   5511 				later code that calls nsd_accept4 */
   5512 		}
   5513 	}
   5514 	return s;
   5515 #else
   5516 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
   5517 #endif /* HAVE_ACCEPT4 */
   5518 }
   5519 
   5520 /*
   5521  * Handle an incoming TCP connection.  The connection is accepted and
   5522  * a new TCP reader event handler is added.  The TCP handler
   5523  * is responsible for cleanup when the connection is closed.
   5524  */
   5525 static void
   5526 handle_tcp_accept(int fd, short event, void* arg)
   5527 {
   5528 	struct tcp_accept_handler_data *data
   5529 		= (struct tcp_accept_handler_data *) arg;
   5530 	int s;
   5531 	int reject = 0;
   5532 	struct tcp_handler_data *tcp_data;
   5533 	region_type *tcp_region;
   5534 #ifdef INET6
   5535 	struct sockaddr_storage addr;
   5536 #else
   5537 	struct sockaddr_in addr;
   5538 #endif
   5539 	socklen_t addrlen;
   5540 	struct timeval timeout;
   5541 
   5542 	if (!(event & EV_READ)) {
   5543 		return;
   5544 	}
   5545 
   5546 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
   5547 		reject = data->nsd->options->tcp_reject_overflow;
   5548 		if (!reject) {
   5549 			return;
   5550 		}
   5551 	}
   5552 
   5553 	/* Accept it... */
   5554 	addrlen = sizeof(addr);
   5555 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
   5556 	if (s == -1) {
   5557 		/**
   5558 		 * EMFILE and ENFILE is a signal that the limit of open
   5559 		 * file descriptors has been reached. Pause accept().
   5560 		 * EINTR is a signal interrupt. The others are various OS ways
   5561 		 * of saying that the client has closed the connection.
   5562 		 */
   5563 		if (errno == EMFILE || errno == ENFILE) {
   5564 			if (!slowaccept) {
   5565 				/* disable accept events */
   5566 				struct timeval tv;
   5567 				configure_handler_event_types(0);
   5568 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
   5569 				tv.tv_usec = 0L;
   5570 				memset(&slowaccept_event, 0,
   5571 					sizeof(slowaccept_event));
   5572 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
   5573 					handle_slowaccept_timeout, NULL);
   5574 				(void)event_base_set(data->event.ev_base,
   5575 					&slowaccept_event);
   5576 				(void)event_add(&slowaccept_event, &tv);
   5577 				slowaccept = 1;
   5578 				/* We don't want to spam the logs here */
   5579 			}
   5580 		} else if (errno != EINTR
   5581 			&& errno != EWOULDBLOCK
   5582 #ifdef ECONNABORTED
   5583 			&& errno != ECONNABORTED
   5584 #endif /* ECONNABORTED */
   5585 #ifdef EPROTO
   5586 			&& errno != EPROTO
   5587 #endif /* EPROTO */
   5588 			) {
   5589 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
   5590 		}
   5591 		return;
   5592 	}
   5593 
   5594 	if (reject) {
   5595 		shutdown(s, SHUT_RDWR);
   5596 		close(s);
   5597 		return;
   5598 	}
   5599 
   5600 	/*
   5601 	 * This region is deallocated when the TCP connection is
   5602 	 * closed by the TCP handler.
   5603 	 */
   5604 	tcp_region = region_create(xalloc, free);
   5605 	tcp_data = (struct tcp_handler_data *) region_alloc(
   5606 		tcp_region, sizeof(struct tcp_handler_data));
   5607 	tcp_data->region = tcp_region;
   5608 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
   5609 		compression_table_size, compressed_dnames);
   5610 	tcp_data->nsd = data->nsd;
   5611 	tcp_data->query_count = 0;
   5612 #ifdef HAVE_SSL
   5613 	tcp_data->shake_state = tls_hs_none;
   5614 	/* initialize both incase of dangling pointers */
   5615 	tcp_data->tls = NULL;
   5616 	tcp_data->tls_auth = NULL;
   5617 #endif
   5618 	tcp_data->query_needs_reset = 1;
   5619 	tcp_data->pp2_enabled = data->pp2_enabled;
   5620 	tcp_data->pp2_header_state = pp2_header_none;
   5621 	tcp_data->prev = NULL;
   5622 	tcp_data->next = NULL;
   5623 
   5624 	tcp_data->query_state = QUERY_PROCESSED;
   5625 	tcp_data->bytes_transmitted = 0;
   5626 	memcpy(&tcp_data->query->remote_addr, &addr, addrlen);
   5627 	tcp_data->query->remote_addrlen = addrlen;
   5628 	/* Copy remote_address to client_address.
   5629 	 * Simplest way/time for streams to do that. */
   5630 	memcpy(&tcp_data->query->client_addr, &addr, addrlen);
   5631 	tcp_data->query->client_addrlen = addrlen;
   5632 	tcp_data->query->is_proxied = 0;
   5633 
   5634 	tcp_data->tcp_no_more_queries = 0;
   5635 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
   5636 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
   5637 		/* very busy, give smaller timeout */
   5638 		tcp_data->tcp_timeout = 200;
   5639 	}
   5640 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
   5641 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
   5642 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
   5643 
   5644 #ifdef USE_DNSTAP
   5645 	/* save the address of the connection */
   5646 	tcp_data->socket = data->socket;
   5647 #endif /* USE_DNSTAP */
   5648 
   5649 #ifdef HAVE_SSL
   5650 	if (data->tls_accept) {
   5651 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
   5652 		if(!tcp_data->tls) {
   5653 			close(s);
   5654 			return;
   5655 		}
   5656 		tcp_data->query->tls = tcp_data->tls;
   5657 		tcp_data->shake_state = tls_hs_read;
   5658 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
   5659 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
   5660 			  handle_tls_reading, tcp_data);
   5661 	} else if (data->tls_auth_accept) {
   5662 		tcp_data->tls_auth = incoming_ssl_fd(tcp_data->nsd->tls_auth_ctx, s);
   5663 		if(!tcp_data->tls_auth) {
   5664 			close(s);
   5665 			return;
   5666 		}
   5667 		tcp_data->query->tls_auth = tcp_data->tls_auth;
   5668 		tcp_data->shake_state = tls_hs_read;
   5669 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
   5670 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
   5671 			  handle_tls_reading, tcp_data);
   5672 	} else {
   5673 #endif
   5674 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
   5675 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
   5676 			  handle_tcp_reading, tcp_data);
   5677 #ifdef HAVE_SSL
   5678 	}
   5679 #endif
   5680 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
   5681 		log_msg(LOG_ERR, "cannot set tcp event base");
   5682 		close(s);
   5683 		region_destroy(tcp_region);
   5684 		return;
   5685 	}
   5686 	if(event_add(&tcp_data->event, &timeout) != 0) {
   5687 		log_msg(LOG_ERR, "cannot add tcp to event base");
   5688 		close(s);
   5689 		region_destroy(tcp_region);
   5690 		return;
   5691 	}
   5692 	if(tcp_active_list) {
   5693 		tcp_active_list->prev = tcp_data;
   5694 		tcp_data->next = tcp_active_list;
   5695 	}
   5696 	tcp_active_list = tcp_data;
   5697 
   5698 	/*
   5699 	 * Keep track of the total number of TCP handlers installed so
   5700 	 * we can stop accepting connections when the maximum number
   5701 	 * of simultaneous TCP connections is reached.
   5702 	 *
   5703 	 * If tcp-reject-overflow is enabled, however, then we do not
   5704 	 * change the handler event type; we keep it as-is and accept
   5705 	 * overflow TCP connections only so that we can forcibly kill
   5706 	 * them off.
   5707 	 */
   5708 	++data->nsd->current_tcp_count;
   5709 	if (!data->nsd->options->tcp_reject_overflow &&
   5710 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
   5711 	{
   5712 		configure_handler_event_types(0);
   5713 	}
   5714 }
   5715 
#ifdef USE_XDP
/* Event callback for the XDP socket: process inbound packets and send
 * the responses when the descriptor becomes readable. */
static void handle_xdp(int fd, short event, void* arg) {
	struct xdp_handler_data *data = (struct xdp_handler_data*) arg;

	(void)fd;
	if (!(event & EV_READ))
		return;
	xdp_handle_recv_and_send(data->server);
}
#endif
   5725 
   5726 static void
   5727 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
   5728 {
   5729 	size_t i;
   5730 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
   5731 	for (i = 0; i < nsd->child_count; ++i) {
   5732 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
   5733 			if (write(nsd->children[i].child_fd,
   5734 				&command,
   5735 				sizeof(command)) == -1)
   5736 			{
   5737 				if(errno != EAGAIN && errno != EINTR)
   5738 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
   5739 					(int) command,
   5740 					(int) nsd->children[i].pid,
   5741 					strerror(errno));
   5742 			} else if (timeout > 0) {
   5743 				(void)block_read(NULL,
   5744 					nsd->children[i].child_fd,
   5745 					&command, sizeof(command), timeout);
   5746 			}
   5747 			fsync(nsd->children[i].child_fd);
   5748 			close(nsd->children[i].child_fd);
   5749 			nsd->children[i].child_fd = -1;
   5750 		}
   5751 	}
   5752 }
   5753 
/* Tell all child server processes to quit; does not wait for them. */
static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}
   5760 
/* Tell all child server processes to quit and wait (up to 3 seconds
 * per child) for each to acknowledge the command. */
static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}
   5767 
#ifdef BIND8_STATS
/* Mark every child so that its IPC handler transmits statistics the
 * next time the channel becomes writable. */
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		/* enable write events so the flag gets acted upon */
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
		nsd->children[i].need_to_send_STATS = 1;
	}
}
#endif /* BIND8_STATS */
   5781 
   5782 static void
   5783 configure_handler_event_types(short event_types)
   5784 {
   5785 	size_t i;
   5786 
   5787 	for (i = 0; i < tcp_accept_handler_count; ++i) {
   5788 		struct event* handler = &tcp_accept_handlers[i].event;
   5789 		if(event_types) {
   5790 			/* reassign */
   5791 			int fd = handler->ev_fd;
   5792 			struct event_base* base = handler->ev_base;
   5793 			if(tcp_accept_handlers[i].event_added)
   5794 				event_del(handler);
   5795 			memset(handler, 0, sizeof(*handler));
   5796 			event_set(handler, fd, event_types,
   5797 				handle_tcp_accept, &tcp_accept_handlers[i]);
   5798 			if(event_base_set(base, handler) != 0)
   5799 				log_msg(LOG_ERR, "conhand: cannot event_base");
   5800 			if(event_add(handler, NULL) != 0)
   5801 				log_msg(LOG_ERR, "conhand: cannot event_add");
   5802 			tcp_accept_handlers[i].event_added = 1;
   5803 		} else {
   5804 			/* remove */
   5805 			if(tcp_accept_handlers[i].event_added) {
   5806 				event_del(handler);
   5807 				tcp_accept_handlers[i].event_added = 0;
   5808 			}
   5809 		}
   5810 	}
   5811 }
   5812