1 /* 2 * server.c -- nsd(8) network input/output 3 * 4 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. 5 * 6 * See LICENSE for the license. 7 * 8 */ 9 10 #include "config.h" 11 12 #include <sys/types.h> 13 #include <sys/param.h> 14 #include <limits.h> 15 #include <sys/socket.h> 16 #include <sys/uio.h> 17 #include <sys/wait.h> 18 19 #include <netinet/in.h> 20 #ifdef USE_TCP_FASTOPEN 21 #include <netinet/tcp.h> 22 #endif 23 #include <arpa/inet.h> 24 25 #include <assert.h> 26 #include <ctype.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <stddef.h> 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <time.h> 34 #include <unistd.h> 35 #include <signal.h> 36 #include <netdb.h> 37 #include <poll.h> 38 #ifdef HAVE_SYS_RANDOM_H 39 #include <sys/random.h> 40 #endif 41 #ifndef SHUT_WR 42 #define SHUT_WR 1 43 #endif 44 #ifdef HAVE_MMAP 45 #include <sys/mman.h> 46 #endif /* HAVE_MMAP */ 47 #ifdef HAVE_OPENSSL_RAND_H 48 #include <openssl/rand.h> 49 #endif 50 #ifdef HAVE_OPENSSL_SSL_H 51 #include <openssl/ssl.h> 52 #endif 53 #ifdef HAVE_OPENSSL_ERR_H 54 #include <openssl/err.h> 55 #endif 56 #ifdef HAVE_OPENSSL_OCSP_H 57 #include <openssl/ocsp.h> 58 #endif 59 #ifndef USE_MINI_EVENT 60 # ifdef HAVE_EVENT_H 61 # include <event.h> 62 # else 63 # include <event2/event.h> 64 # include "event2/event_struct.h" 65 # include "event2/event_compat.h" 66 # endif 67 #else 68 # include "mini_event.h" 69 #endif 70 71 #include "axfr.h" 72 #include "namedb.h" 73 #include "netio.h" 74 #include "xfrd.h" 75 #include "xfrd-tcp.h" 76 #include "xfrd-disk.h" 77 #include "difffile.h" 78 #include "nsec3.h" 79 #include "ipc.h" 80 #include "udb.h" 81 #include "remote.h" 82 #include "lookup3.h" 83 #include "rrl.h" 84 #include "ixfr.h" 85 #ifdef USE_DNSTAP 86 #include "dnstap/dnstap_collector.h" 87 #endif 88 #include "verify.h" 89 #include "util/proxy_protocol.h" 90 #ifdef USE_XDP 91 #include "xdp-server.h" 92 #endif 93 #ifdef USE_METRICS 94 #include 
"metrics.h" 95 #endif /* USE_METRICS */ 96 97 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */ 98 99 #ifdef USE_DNSTAP 100 /* 101 * log_addr() - the function to print sockaddr_in/sockaddr_in6 structures content 102 * just like its done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*) 103 */ 104 static void 105 log_addr(const char* descr, 106 #ifdef INET6 107 struct sockaddr_storage* addr 108 #else 109 struct sockaddr_in* addr 110 #endif 111 ) 112 { 113 char str_buf[64]; 114 if(verbosity < 6) 115 return; 116 if( 117 #ifdef INET6 118 addr->ss_family == AF_INET 119 #else 120 addr->sin_family == AF_INET 121 #endif 122 ) { 123 struct sockaddr_in* s = (struct sockaddr_in*)addr; 124 inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf)); 125 VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port))); 126 #ifdef INET6 127 } else { 128 struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr; 129 inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf)); 130 VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port))); 131 #endif 132 } 133 } 134 #endif /* USE_DNSTAP */ 135 136 #ifdef USE_TCP_FASTOPEN 137 #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen" 138 #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2 139 #endif 140 141 /* header state for the PROXYv2 header (for TCP) */ 142 enum pp2_header_state { 143 /* no header encounter yet */ 144 pp2_header_none = 0, 145 /* read the static part of the header */ 146 pp2_header_init, 147 /* read the full header */ 148 pp2_header_done 149 }; 150 151 /* 152 * Data for the UDP handlers. 
 */
struct udp_handler_data
{
	/* the global nsd structure */
	struct nsd *nsd;
	/* the listening socket this handler serves */
	struct nsd_socket *socket;
	struct event event;
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};

struct tcp_accept_handler_data {
	struct nsd *nsd;
	/* the listening (accept) socket */
	struct nsd_socket *socket;
	/* nonzero when the event is currently registered with the loop */
	int event_added;
	struct event event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int tls_accept;
	int tls_auth_accept;
#endif
	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;
};

#ifdef USE_XDP
struct xdp_handler_data {
	struct nsd *nsd;
	struct xdp_server *server;
	struct event event;
};
#endif

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connection drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

/* NOTE(review): timer and flag presumably used to back off accept()
 * after a transient failure — confirm against handle_tcp_accept. */
static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
/* OCSP response bytes served for stapling (NULL when not loaded) */
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
/* fallback definition for systems without recvmmsg/sendmmsg support */
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int msg_len;
};
#endif

/* per-batch receive state, one slot per message in a recvmmsg batch */
static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
#ifdef USE_XDP
static struct query *xdp_queries[XDP_RX_BATCH_SIZE];
#endif

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O. This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler. When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure. This region is destroyed
	 * when the connection is closed.
	 */
	region_type* region;

	/*
	 * The global nsd structure.
	 */
	struct nsd* nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type* query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet. The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t bytes_transmitted;

	/* If the query is restarted and needs a reset */
	int query_needs_reset;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the socket of the accept socket to find proper service (local) address the socket is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

	/* if set, PROXYv2 is expected on this connection */
	int pp2_enabled;

	/* header state for the PROXYv2 header (for TCP) */
	enum pp2_header_state pp2_header_state;

#ifdef HAVE_SSL
	/*
	 * TLS objects.
	 */
	SSL* tls;
	SSL* tls_auth;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets. These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type. This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

#ifdef USE_XDP
static void handle_xdp(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set childrens flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
394 */ 395 static void configure_handler_event_types(short event_types); 396 397 static uint16_t *compressed_dname_offsets = 0; 398 static uint32_t compression_table_capacity = 0; 399 static uint32_t compression_table_size = 0; 400 static domain_type* compressed_dnames[MAXRRSPP]; 401 402 #ifdef USE_TCP_FASTOPEN 403 /* Checks to see if the kernel value must be manually changed in order for 404 TCP Fast Open to support server mode */ 405 static void report_tcp_fastopen_config() { 406 407 int tcp_fastopen_fp; 408 uint8_t tcp_fastopen_value; 409 410 if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) { 411 log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 412 } 413 if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) { 414 log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 415 close(tcp_fastopen_fp); 416 } 417 if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) { 418 log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n"); 419 log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n"); 420 log_msg(LOG_WARNING, "To enable TFO use the command:"); 421 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n"); 422 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n"); 423 log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n"); 424 close(tcp_fastopen_fp); 425 } 426 close(tcp_fastopen_fp); 427 } 428 #endif 429 430 /* 431 * Remove the specified pid from the list of child pids. Returns -1 if 432 * the pid is not in the list, child_num otherwise. The field is set to 0. 
433 */ 434 static int 435 delete_child_pid(struct nsd *nsd, pid_t pid) 436 { 437 size_t i; 438 for (i = 0; i < nsd->child_count; ++i) { 439 if (nsd->children[i].pid == pid) { 440 nsd->children[i].pid = 0; 441 if(!nsd->children[i].need_to_exit) { 442 if(nsd->children[i].child_fd != -1) 443 close(nsd->children[i].child_fd); 444 nsd->children[i].child_fd = -1; 445 if(nsd->children[i].handler) 446 nsd->children[i].handler->fd = -1; 447 } 448 return i; 449 } 450 } 451 return -1; 452 } 453 454 /* 455 * Restart child servers if necessary. 456 */ 457 static int 458 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio, 459 int* xfrd_sock_p) 460 { 461 size_t i; 462 int sv[2]; 463 464 /* Fork the child processes... */ 465 for (i = 0; i < nsd->child_count; ++i) { 466 if (nsd->children[i].pid <= 0) { 467 if (nsd->children[i].child_fd != -1) 468 close(nsd->children[i].child_fd); 469 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) { 470 log_msg(LOG_ERR, "socketpair: %s", 471 strerror(errno)); 472 return -1; 473 } 474 nsd->children[i].child_fd = sv[0]; 475 nsd->children[i].parent_fd = sv[1]; 476 nsd->children[i].pid = fork(); 477 switch (nsd->children[i].pid) { 478 default: /* SERVER MAIN */ 479 close(nsd->children[i].parent_fd); 480 nsd->children[i].parent_fd = -1; 481 if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) { 482 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 483 } 484 if(!nsd->children[i].handler) 485 { 486 struct main_ipc_handler_data *ipc_data; 487 ipc_data = (struct main_ipc_handler_data*) region_alloc( 488 region, sizeof(struct main_ipc_handler_data)); 489 ipc_data->nsd = nsd; 490 ipc_data->child = &nsd->children[i]; 491 nsd->children[i].handler = (struct netio_handler*) region_alloc( 492 region, sizeof(struct netio_handler)); 493 nsd->children[i].handler->fd = nsd->children[i].child_fd; 494 nsd->children[i].handler->timeout = NULL; 495 nsd->children[i].handler->user_data = ipc_data; 496 
nsd->children[i].handler->event_types = NETIO_EVENT_READ; 497 nsd->children[i].handler->event_handler = parent_handle_child_command; 498 netio_add_handler(netio, nsd->children[i].handler); 499 } 500 /* restart - update fd */ 501 nsd->children[i].handler->fd = nsd->children[i].child_fd; 502 break; 503 case 0: /* CHILD */ 504 #ifdef MEMCLEAN /* OS collects memory pages */ 505 region_destroy(region); 506 #endif 507 nsd->pid = 0; 508 nsd->child_count = 0; 509 nsd->server_kind = nsd->children[i].kind; 510 nsd->this_child = &nsd->children[i]; 511 nsd->this_child->child_num = i; 512 /* remove signal flags inherited from parent 513 the parent will handle them. */ 514 nsd->signal_hint_reload_hup = 0; 515 nsd->signal_hint_reload = 0; 516 nsd->signal_hint_child = 0; 517 nsd->signal_hint_quit = 0; 518 nsd->signal_hint_shutdown = 0; 519 nsd->signal_hint_stats = 0; 520 nsd->signal_hint_statsusr = 0; 521 close(*xfrd_sock_p); 522 close(nsd->this_child->child_fd); 523 nsd->this_child->child_fd = -1; 524 if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) { 525 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 526 } 527 server_child(nsd); 528 /* NOTREACH */ 529 exit(0); 530 case -1: 531 log_msg(LOG_ERR, "fork failed: %s", 532 strerror(errno)); 533 return -1; 534 } 535 } 536 } 537 return 0; 538 } 539 540 #ifdef BIND8_STATS 541 static void set_bind8_alarm(struct nsd* nsd) 542 { 543 /* resync so that the next alarm is on the next whole minute */ 544 if(nsd->st_period > 0) /* % by 0 gives divbyzero error */ 545 alarm(nsd->st_period - (time(NULL) % nsd->st_period)); 546 } 547 #endif 548 549 /* set zone stat ids for zones initially read in */ 550 static void 551 zonestatid_tree_set(struct nsd* nsd) 552 { 553 struct radnode* n; 554 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 555 zone_type* zone = (zone_type*)n->elem; 556 zone->zonestatid = getzonestatid(nsd->options, zone->opts); 557 } 558 } 559 560 #ifdef USE_ZONE_STATS 561 void 562 
server_zonestat_alloc(struct nsd* nsd)
{
	/* at least one slot even when no zonestat names are configured */
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	/* extend both files to sz bytes (seek to last byte and write one
	 * zero byte) so the mmap below is fully backed */
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	/* map both stat arrays shared so parent and children see updates */
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

/* grow (or shrink) the mapping of zonestat array idx to sz bytes */
void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !HAVE MREMAP */
	/* no mremap: flush, unmap and map again at the new size */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	/* extend the backing file before remapping */
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switchover to use the other array for the new children, that
 * briefly coexist with the old children. And we want to avoid them
 * both writing to the same statistics arrays.
 */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

#ifdef BIND8_STATS
/* allocate the shared (mmap-backed) BIND8 statistics arrays, two
 * per-child sets so old and new children do not write the same slots */
void
server_stat_alloc(struct nsd* nsd)
{
	char tmpfile[256];
	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
	uint8_t z = 0;

	/* file name */
	nsd->statfname = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->statfname = region_strdup(nsd->region, tmpfile);

	/* file descriptor */
	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
	if(nsd->statfd == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
			strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}

#ifdef HAVE_MMAP
	/* extend the file to sz bytes so the mmap is fully backed */
	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
			strerror(errno));
		goto fail_exit;
	}
	if(write(nsd->statfd, &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->statfname, strerror(errno));
		goto fail_exit;
	}
	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->statfd, 0);
	if(nsd->stat_map == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
fail_exit:
		close(nsd->statfd);
		unlink(nsd->statfname);
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->stat_map, 0, sz);
	nsd->stats_per_child[0] = nsd->stat_map;
	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
	nsd->stat_current = 0;
	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
#endif
/* HAVE_MMAP */ 775 } 776 #endif /* BIND8_STATS */ 777 778 #ifdef BIND8_STATS 779 void 780 server_stat_free(struct nsd* nsd) 781 { 782 unlink(nsd->statfname); 783 } 784 #endif /* BIND8_STATS */ 785 786 static void 787 cleanup_dname_compression_tables(void *ptr) 788 { 789 free(ptr); 790 compressed_dname_offsets = NULL; 791 compression_table_capacity = 0; 792 } 793 794 static void 795 initialize_dname_compression_tables(struct nsd *nsd) 796 { 797 size_t needed = domain_table_count(nsd->db->domains) + 1; 798 needed += EXTRA_DOMAIN_NUMBERS; 799 if(compression_table_capacity < needed) { 800 if(compressed_dname_offsets) { 801 region_remove_cleanup(nsd->db->region, 802 cleanup_dname_compression_tables, 803 compressed_dname_offsets); 804 free(compressed_dname_offsets); 805 } 806 compressed_dname_offsets = (uint16_t *) xmallocarray( 807 needed, sizeof(uint16_t)); 808 region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables, 809 compressed_dname_offsets); 810 compression_table_capacity = needed; 811 compression_table_size=domain_table_count(nsd->db->domains)+1; 812 } 813 memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t)); 814 compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */ 815 } 816 817 static int 818 set_cloexec(struct nsd_socket *sock) 819 { 820 assert(sock != NULL); 821 822 if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) { 823 const char *socktype = 824 sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp"; 825 log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s", 826 socktype, strerror(errno)); 827 return -1; 828 } 829 830 return 1; 831 } 832 833 static int 834 set_reuseport(struct nsd_socket *sock) 835 { 836 #ifdef SO_REUSEPORT 837 int on = 1; 838 #ifdef SO_REUSEPORT_LB 839 /* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like 840 * SO_REUSEPORT on Linux. 
This is what the users want with the config 841 * option in nsd.conf; if we actually need local address and port reuse 842 * they'll also need to have SO_REUSEPORT set for them, assume it was 843 * _LB they want. 844 */ 845 int opt = SO_REUSEPORT_LB; 846 static const char optname[] = "SO_REUSEPORT_LB"; 847 #else /* !SO_REUSEPORT_LB */ 848 int opt = SO_REUSEPORT; 849 static const char optname[] = "SO_REUSEPORT"; 850 #endif /* SO_REUSEPORT_LB */ 851 852 if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) { 853 return 1; 854 } else if(verbosity >= 3 || errno != ENOPROTOOPT) { 855 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 856 optname, strerror(errno)); 857 } 858 return -1; 859 #else 860 (void)sock; 861 #endif /* SO_REUSEPORT */ 862 863 return 0; 864 } 865 866 static int 867 set_reuseaddr(struct nsd_socket *sock) 868 { 869 #ifdef SO_REUSEADDR 870 int on = 1; 871 if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) { 872 return 1; 873 } 874 log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) 
failed: %s", 875 strerror(errno)); 876 return -1; 877 #endif /* SO_REUSEADDR */ 878 return 0; 879 } 880 881 static int 882 set_rcvbuf(struct nsd_socket *sock, int rcv) 883 { 884 #ifdef SO_RCVBUF 885 #ifdef SO_RCVBUFFORCE 886 if(0 == setsockopt( 887 sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv))) 888 { 889 return 1; 890 } 891 if(errno == EPERM || errno == ENOBUFS) { 892 if(errno == ENOBUFS) { 893 VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_RCVBUFFORCE, %d) was not granted: %s", 894 rcv, strerror(errno))); 895 } 896 return 0; 897 } 898 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, %d) failed: %s", 899 rcv, strerror(errno)); 900 return -1; 901 #else /* !SO_RCVBUFFORCE */ 902 if (0 == setsockopt( 903 sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) 904 { 905 return 1; 906 } 907 if(errno == ENOSYS || errno == ENOBUFS) { 908 if(errno == ENOBUFS) { 909 VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_RCVBUF, %d) was not granted: %s", 910 rcv, strerror(errno))); 911 } 912 return 0; 913 } 914 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, %d) failed: %s", 915 rcv, strerror(errno)); 916 return -1; 917 #endif /* SO_RCVBUFFORCE */ 918 #endif /* SO_RCVBUF */ 919 920 return 0; 921 } 922 923 static int 924 set_sndbuf(struct nsd_socket *sock, int snd) 925 { 926 #ifdef SO_SNDBUF 927 #ifdef SO_SNDBUFFORCE 928 if(0 == setsockopt( 929 sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) 930 { 931 return 1; 932 } 933 if(errno == EPERM || errno == ENOBUFS) { 934 if(errno == ENOBUFS) { 935 VERBOSITY(2, (LOG_INFO, "setsockopt(..., SO_SNDBUFFORCE, %d) was not granted: %s", 936 snd, strerror(errno))); 937 } 938 return 0; 939 } 940 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, %d) failed: %s", 941 snd, strerror(errno)); 942 return -1; 943 #else /* !SO_SNDBUFFORCE */ 944 if(0 == setsockopt( 945 sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) 946 { 947 return 1; 948 } 949 if(errno == ENOSYS || errno == ENOBUFS) { 950 if(errno == ENOBUFS) { 951 VERBOSITY(2, (LOG_INFO, 
"setsockopt(..., SO_SNDBUF, %d) was not granted: %s", 952 snd, strerror(errno))); 953 } 954 return 0; 955 } 956 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, %d) failed: %s", 957 snd, strerror(errno)); 958 return -1; 959 #endif /* SO_SNDBUFFORCE */ 960 #endif /* SO_SNDBUF */ 961 962 return 0; 963 } 964 965 static int 966 set_nonblock(struct nsd_socket *sock) 967 { 968 const char *socktype = 969 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 970 971 if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { 972 log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", 973 socktype, strerror(errno)); 974 return -1; 975 } 976 977 return 1; 978 } 979 980 #ifdef INET6 981 static int 982 set_ipv6_v6only(struct nsd_socket *sock) 983 { 984 #ifdef IPV6_V6ONLY 985 int on = 1; 986 const char *socktype = 987 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 988 989 if(0 == setsockopt( 990 sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) 991 { 992 return 1; 993 } 994 995 log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", 996 socktype, strerror(errno)); 997 return -1; 998 #else 999 (void)sock; 1000 #endif /* IPV6_V6ONLY */ 1001 1002 return 0; 1003 } 1004 #endif /* INET6 */ 1005 1006 #ifdef INET6 1007 static int 1008 set_ipv6_use_min_mtu(struct nsd_socket *sock) 1009 { 1010 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) 1011 #if defined(IPV6_USE_MIN_MTU) 1012 /* There is no fragmentation of IPv6 datagrams during forwarding in the 1013 * network. Therefore we do not send UDP datagrams larger than the 1014 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be 1015 * larger if the network stack supports IPV6_USE_MIN_MTU. 1016 */ 1017 int opt = IPV6_USE_MIN_MTU; 1018 int optval = 1; 1019 static const char optname[] = "IPV6_USE_MIN_MTU"; 1020 #elif defined(IPV6_MTU) 1021 /* On Linux, PMTUD is disabled by default for datagrams so set the MTU 1022 * to the MIN MTU to get the same. 
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* INET6 */

	return 0;
}
#endif /* INET6 */

/* disable path-MTU discovery on an IPv4 socket so responses are sent
 * with DF=0; returns 1 on success, -1 on error, 0 if unsupported */
static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters smaller MTU link in network. This mitigates
	 * DNS fragmentation attacks by preventing forged PMTU information.
	 * FreeBSD already has same semantics without setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	/* BSD-style: clear the don't-fragment flag directly */
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

/* allow binding to an address that is not (yet) configured locally;
 * 1 on success, -1 on error, 0 when IP_FREEBIND is unavailable */
static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platform manage this setting
	differently for different address families (IPv4 vs IPv6).
	This scandalous preprocessor blob below abstracts such variability
	in the way which leaves the C code as lean and clear as possible.
1135 */ 1136 1137 #if defined(IP_TRANSPARENT) 1138 # define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT 1139 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1140 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT" 1141 // as of 2020-01, Linux does not support this on IPv6 programmatically 1142 #elif defined(SO_BINDANY) 1143 # define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY 1144 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET 1145 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY" 1146 #elif defined(IP_BINDANY) 1147 # define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY 1148 # define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY 1149 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1150 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6 1151 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY" 1152 #endif 1153 1154 #ifndef NSD_SOCKET_OPTION_TRANSPARENT 1155 (void)sock; 1156 #else 1157 # ifndef NSD_SOCKET_OPTION_TRANSPARENT6 1158 # define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT 1159 # endif 1160 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 1161 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL 1162 # endif 1163 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6 1164 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME 1165 # endif 1166 1167 int on = 1; 1168 const char *socktype = 1169 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1170 const int is_ip6 = (sock->addr.ai_family == AF_INET6); 1171 1172 if(0 == setsockopt( 1173 sock->s, 1174 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL, 1175 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT, 1176 &on, sizeof(on))) 1177 { 1178 return 1; 1179 } 1180 1181 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s", 1182 is_ip6 ? 
NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno)); 1183 return -1; 1184 #endif 1185 1186 return 0; 1187 } 1188 1189 static int 1190 set_tcp_maxseg(struct nsd_socket *sock, int mss) 1191 { 1192 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG) 1193 if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) { 1194 return 1; 1195 } 1196 log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s", 1197 strerror(errno)); 1198 return -1; 1199 #else 1200 log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported"); 1201 #endif 1202 return 0; 1203 } 1204 1205 #ifdef USE_TCP_FASTOPEN 1206 static int 1207 set_tcp_fastopen(struct nsd_socket *sock) 1208 { 1209 /* qlen specifies how many outstanding TFO requests to allow. Limit is 1210 * a defense against IP spoofing attacks as suggested in RFC7413. 1211 */ 1212 int qlen; 1213 1214 #ifdef __APPLE__ 1215 /* macOS X implementation only supports qlen of 1 via this call. The 1216 * actual value is configured by the net.inet.tcp.fastopen_backlog 1217 * kernel parameter. 1218 */ 1219 qlen = 1; 1220 #else 1221 /* 5 is recommended on Linux. */ 1222 qlen = 5; 1223 #endif 1224 if (0 == setsockopt( 1225 sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen))) 1226 { 1227 return 1; 1228 } 1229 1230 if (errno == EPERM) { 1231 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s " 1232 "; this could likely be because sysctl " 1233 "net.inet.tcp.fastopen.enabled, " 1234 "net.inet.tcp.fastopen.server_enable, or " 1235 "net.ipv4.tcp_fastopen is disabled", 1236 strerror(errno)); 1237 /* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support 1238 * disabled, except when verbosity enabled for debugging 1239 */ 1240 } else if(errno != ENOPROTOOPT || verbosity >= 3) { 1241 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s", 1242 strerror(errno)); 1243 } 1244 1245 return (errno == ENOPROTOOPT ? 
/*
 * Attach the socket to the configured FreeBSD routing table (FIB).
 * Returns 1 when the option was applied, -1 on error (logged), and
 * 0 on platforms without SO_SETFIB.
 */
static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(0 == setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		"SO_SETFIB", sock->fib, strerror(errno));
	return -1;
#else
	(void)sock;
	return 0;
#endif
}
/*
 * Create, configure, bind and listen on one TCP server socket described
 * by *sock.  Returns 1 on success, 0 when an optional IPv6 socket cannot
 * be created because the host lacks IPv6, and -1 on fatal errors.
 * On a reuseport-capable system, *reuseport_works is updated so callers
 * can detect when SO_REUSEPORT stopped working mid-way.
 */
static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	/* silences unused-parameter warning in builds where the uses
	 * below are compiled out; harmless otherwise */
	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		/* an optional IPv6 listener on a v4-only host is not an error */
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	/* best effort; failure already logged by the helper */
	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	/* TFO failure is non-fatal: the socket still works without it */
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, nsd->options->tcp_listen_queue) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}
*/ 1441 1442 /* open server interface ports */ 1443 for(i = 0; i < nsd->ifs; i++) { 1444 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 || 1445 open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1) 1446 { 1447 return -1; 1448 } 1449 } 1450 1451 if(nsd->reuseport && reuseport) { 1452 size_t ifs = nsd->ifs * nsd->reuseport; 1453 1454 /* increase the size of the interface arrays, there are going 1455 * to be separate interface file descriptors for every server 1456 * instance */ 1457 region_remove_cleanup(nsd->region, free, nsd->udp); 1458 region_remove_cleanup(nsd->region, free, nsd->tcp); 1459 1460 nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp)); 1461 nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp)); 1462 region_add_cleanup(nsd->region, free, nsd->udp); 1463 region_add_cleanup(nsd->region, free, nsd->tcp); 1464 if(ifs > nsd->ifs) { 1465 memset(&nsd->udp[nsd->ifs], 0, 1466 (ifs-nsd->ifs)*sizeof(*nsd->udp)); 1467 memset(&nsd->tcp[nsd->ifs], 0, 1468 (ifs-nsd->ifs)*sizeof(*nsd->tcp)); 1469 } 1470 1471 for(i = nsd->ifs; i < ifs; i++) { 1472 nsd->udp[i] = nsd->udp[i%nsd->ifs]; 1473 nsd->udp[i].s = -1; 1474 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) { 1475 return -1; 1476 } 1477 nsd->tcp[i] = nsd->tcp[i%nsd->ifs]; 1478 nsd->tcp[i].s = -1; 1479 if(open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1) { 1480 return -1; 1481 } 1482 } 1483 1484 nsd->ifs = ifs; 1485 } else { 1486 nsd->reuseport = 0; 1487 } 1488 1489 /* open server interface ports for verifiers */ 1490 for(i = 0; i < nsd->verify_ifs; i++) { 1491 if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 || 1492 open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1) 1493 { 1494 return -1; 1495 } 1496 } 1497 1498 return 0; 1499 } 1500 1501 /* 1502 * Prepare the server for take off. 
1503 * 1504 */ 1505 int 1506 server_prepare(struct nsd *nsd) 1507 { 1508 #ifdef RATELIMIT 1509 /* set secret modifier for hashing (rate limits) */ 1510 #ifdef HAVE_GETRANDOM 1511 uint32_t v; 1512 if(getrandom(&v, sizeof(v), 0) == -1) { 1513 log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno)); 1514 exit(1); 1515 } 1516 hash_set_raninit(v); 1517 #elif defined(HAVE_ARC4RANDOM) 1518 hash_set_raninit(arc4random()); 1519 #else 1520 uint32_t v = getpid() ^ time(NULL); 1521 srandom((unsigned long)v); 1522 # ifdef HAVE_SSL 1523 if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0) 1524 hash_set_raninit(v); 1525 else 1526 # endif 1527 hash_set_raninit(random()); 1528 #endif 1529 rrl_mmap_init(nsd->child_count, nsd->options->rrl_size, 1530 nsd->options->rrl_ratelimit, 1531 nsd->options->rrl_whitelist_ratelimit, 1532 nsd->options->rrl_slip, 1533 nsd->options->rrl_ipv4_prefix_length, 1534 nsd->options->rrl_ipv6_prefix_length); 1535 #endif /* RATELIMIT */ 1536 1537 /* Open the database... */ 1538 if ((nsd->db = namedb_open(nsd->options)) == NULL) { 1539 log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno)); 1540 unlink(nsd->task[0]->fname); 1541 unlink(nsd->task[1]->fname); 1542 #ifdef USE_ZONE_STATS 1543 unlink(nsd->zonestatfname[0]); 1544 unlink(nsd->zonestatfname[1]); 1545 #endif 1546 #ifdef BIND8_STATS 1547 server_stat_free(nsd); 1548 #endif 1549 xfrd_del_tempdir(nsd); 1550 return -1; 1551 } 1552 /* check if zone files can be read */ 1553 /* NULL for taskudb because we send soainfo in a moment, batched up, 1554 * for all zones */ 1555 namedb_check_zonefiles(nsd, nsd->options, NULL, NULL); 1556 zonestatid_tree_set(nsd); 1557 1558 compression_table_capacity = 0; 1559 initialize_dname_compression_tables(nsd); 1560 1561 #ifdef BIND8_STATS 1562 /* Initialize times... 
*/ 1563 time(&nsd->st->boot); 1564 nsd->st->reloadcount = 0; 1565 set_bind8_alarm(nsd); 1566 #endif /* BIND8_STATS */ 1567 1568 return 0; 1569 } 1570 1571 /* 1572 * Fork the required number of servers. 1573 */ 1574 static int 1575 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio, 1576 int* xfrd_sock_p) 1577 { 1578 size_t i; 1579 1580 /* Start all child servers initially. */ 1581 for (i = 0; i < nsd->child_count; ++i) { 1582 nsd->children[i].pid = 0; 1583 } 1584 1585 return restart_child_servers(nsd, region, netio, xfrd_sock_p); 1586 } 1587 1588 static void 1589 server_close_socket(struct nsd_socket *sock) 1590 { 1591 if(sock->s != -1) { 1592 close(sock->s); 1593 sock->s = -1; 1594 } 1595 } 1596 1597 void 1598 server_close_all_sockets(struct nsd_socket sockets[], size_t n) 1599 { 1600 size_t i; 1601 1602 /* Close all the sockets... */ 1603 for (i = 0; i < n; ++i) { 1604 server_close_socket(&sockets[i]); 1605 } 1606 } 1607 1608 /* 1609 * Close the sockets, shutdown the server and exit. 1610 * Does not return. 
/*
 * Close the sockets, shutdown the server and exit.
 * Runs in both the parent server and the child server processes;
 * which IPC descriptors get closed depends on nsd->this_child.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#ifdef USE_METRICS
	daemon_metrics_delete(nsd->metrics);
#endif /* USE_METRICS */
#ifdef HAVE_SSL
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
	if (nsd->tls_auth_ctx)
		SSL_CTX_free(nsd->tls_auth_ctx);
#endif

	/* explicit teardown only when MEMCLEAN is requested (e.g. for leak
	 * checkers); otherwise the OS reclaims the pages on exit() */
#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_free_ixfr(nsd->db);
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}
server_stat_free(nsd); 1683 #endif 1684 xfrd_del_tempdir(nsd); 1685 exit(1); 1686 } 1687 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1", 1688 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1689 nsd->task[1] = task_file_create(tmpfile); 1690 if(!nsd->task[1]) { 1691 unlink(nsd->task[0]->fname); 1692 #ifdef USE_ZONE_STATS 1693 unlink(nsd->zonestatfname[0]); 1694 unlink(nsd->zonestatfname[1]); 1695 #endif 1696 #ifdef BIND8_STATS 1697 server_stat_free(nsd); 1698 #endif 1699 xfrd_del_tempdir(nsd); 1700 exit(1); 1701 } 1702 assert(udb_base_get_userdata(nsd->task[0])->data == 0); 1703 assert(udb_base_get_userdata(nsd->task[1])->data == 0); 1704 /* create xfrd listener structure */ 1705 nsd->xfrd_listener = region_alloc(nsd->region, 1706 sizeof(netio_handler_type)); 1707 nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*) 1708 region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data)); 1709 nsd->xfrd_listener->fd = -1; 1710 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd = 1711 nsd; 1712 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn = 1713 xfrd_tcp_create(nsd->region, QIOBUFSZ); 1714 /* setup sockets to pass NOTIFY messages from the serve processes */ 1715 nsd->serve2xfrd_fd_send = region_alloc_array( 1716 nsd->region, 2 * nsd->child_count, sizeof(int)); 1717 nsd->serve2xfrd_fd_recv= region_alloc_array( 1718 nsd->region, 2 * nsd->child_count, sizeof(int)); 1719 for(i=0; i < 2 * nsd->child_count; i++) { 1720 int pipefd[2]; 1721 pipefd[0] = -1; /* For receiving by parent (xfrd) */ 1722 pipefd[1] = -1; /* For sending by child (server childs) */ 1723 if(pipe(pipefd) < 0) { 1724 log_msg(LOG_ERR, "fatal error: cannot create NOTIFY " 1725 "communication channel: %s", strerror(errno)); 1726 exit(1); 1727 } 1728 nsd->serve2xfrd_fd_recv[i] = pipefd[0]; 1729 nsd->serve2xfrd_fd_send[i] = pipefd[1]; 1730 } 1731 nsd->serve2xfrd_fd_swap = nsd->serve2xfrd_fd_send + nsd->child_count; 1732 } 1733 1734 
1735 void 1736 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active) 1737 { 1738 pid_t pid; 1739 int sockets[2] = {0,0}; 1740 struct ipc_handler_conn_data *data; 1741 size_t i; 1742 1743 if(nsd->xfrd_listener->fd != -1) 1744 close(nsd->xfrd_listener->fd); 1745 if(del_db) { 1746 /* recreate taskdb that xfrd was using, it may be corrupt */ 1747 /* we (or reload) use nsd->mytask, and xfrd uses the other */ 1748 char* tmpfile = nsd->task[1-nsd->mytask]->fname; 1749 nsd->task[1-nsd->mytask]->fname = NULL; 1750 /* free alloc already, so udb does not shrink itself */ 1751 udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc); 1752 nsd->task[1-nsd->mytask]->alloc = NULL; 1753 udb_base_free(nsd->task[1-nsd->mytask]); 1754 /* create new file, overwrite the old one */ 1755 nsd->task[1-nsd->mytask] = task_file_create(tmpfile); 1756 free(tmpfile); 1757 } 1758 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) { 1759 log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno)); 1760 return; 1761 } 1762 pid = fork(); 1763 switch (pid) { 1764 case -1: 1765 log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno)); 1766 break; 1767 default: 1768 /* PARENT: close first socket, use second one */ 1769 close(sockets[0]); 1770 if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) { 1771 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1772 } 1773 if(del_db) xfrd_free_namedb(nsd); 1774 /* use other task than I am using, since if xfrd died and is 1775 * restarted, the reload is using nsd->mytask */ 1776 nsd->mytask = 1 - nsd->mytask; 1777 1778 /* close the send site of the serve2xfrd fds */ 1779 assert(nsd->serve2xfrd_fd_send < nsd->serve2xfrd_fd_swap); 1780 for(i = 0; i < 2 * nsd->child_count; i++) { 1781 if(nsd->serve2xfrd_fd_send[i] != -1) { 1782 close(nsd->serve2xfrd_fd_send[i]); 1783 nsd->serve2xfrd_fd_send[i] = -1; 1784 } 1785 } 1786 #ifdef HAVE_SETPROCTITLE 1787 setproctitle("xfrd"); 1788 #endif 1789 #ifdef USE_LOG_PROCESS_ROLE 1790 
log_set_process_role("xfrd"); 1791 #endif 1792 #ifdef HAVE_CPUSET_T 1793 if(nsd->use_cpu_affinity) { 1794 set_cpu_affinity(nsd->xfrd_cpuset); 1795 } 1796 #endif 1797 1798 xfrd_init(sockets[1], nsd, del_db, reload_active, pid); 1799 /* ENOTREACH */ 1800 break; 1801 case 0: 1802 /* CHILD: close second socket, use first one */ 1803 close(sockets[1]); 1804 if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) { 1805 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1806 } 1807 nsd->xfrd_listener->fd = sockets[0]; 1808 /* close the receive site of the serve2xfrd fds */ 1809 for(i = 0; i < 2 * nsd->child_count; i++) { 1810 if(nsd->serve2xfrd_fd_recv[i] != -1) { 1811 close(nsd->serve2xfrd_fd_recv[i]); 1812 nsd->serve2xfrd_fd_recv[i] = -1; 1813 } 1814 } 1815 #ifdef HAVE_SETPROCTITLE 1816 setproctitle("main"); 1817 #endif 1818 #ifdef USE_LOG_PROCESS_ROLE 1819 log_set_process_role("main"); 1820 #endif 1821 break; 1822 } 1823 /* server-parent only */ 1824 nsd->xfrd_listener->timeout = NULL; 1825 nsd->xfrd_listener->event_types = NETIO_EVENT_READ; 1826 nsd->xfrd_listener->event_handler = parent_handle_xfrd_command; 1827 /* clear ongoing ipc reads */ 1828 data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data; 1829 data->conn->is_reading = 0; 1830 } 1831 1832 /** add all soainfo to taskdb */ 1833 static void 1834 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb) 1835 { 1836 struct radnode* n; 1837 udb_ptr task_last; /* last task, mytask is empty so NULL */ 1838 /* add all SOA INFO to mytask */ 1839 udb_ptr_init(&task_last, taskudb); 1840 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 1841 task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0); 1842 } 1843 udb_ptr_unlink(&task_last, taskudb); 1844 } 1845 1846 void 1847 server_send_soa_xfrd(struct nsd* nsd, int shortsoa) 1848 { 1849 /* normally this exchanges the SOA from nsd->xfrd and the expire back. 1850 * parent fills one taskdb with soas, xfrd fills other with expires. 
1851 * then they exchange and process. 1852 * shortsoa: xfrd crashes and needs to be restarted and one taskdb 1853 * may be in use by reload. Fill SOA in taskdb and give to xfrd. 1854 * expire notifications can be sent back via a normal reload later 1855 * (xfrd will wait for current running reload to finish if any). 1856 */ 1857 sig_atomic_t cmd = 0; 1858 pid_t mypid; 1859 int xfrd_sock = nsd->xfrd_listener->fd; 1860 struct udb_base* taskudb = nsd->task[nsd->mytask]; 1861 udb_ptr t; 1862 if(!shortsoa) { 1863 if(nsd->signal_hint_shutdown) { 1864 shutdown: 1865 log_msg(LOG_WARNING, "signal received, shutting down..."); 1866 server_close_all_sockets(nsd->udp, nsd->ifs); 1867 server_close_all_sockets(nsd->tcp, nsd->ifs); 1868 daemon_remote_close(nsd->rc); 1869 /* Unlink it if possible... */ 1870 unlinkpid(nsd->pidfile, nsd->username); 1871 unlink(nsd->task[0]->fname); 1872 unlink(nsd->task[1]->fname); 1873 #ifdef USE_ZONE_STATS 1874 unlink(nsd->zonestatfname[0]); 1875 unlink(nsd->zonestatfname[1]); 1876 #endif 1877 #ifdef BIND8_STATS 1878 server_stat_free(nsd); 1879 #endif 1880 server_shutdown(nsd); 1881 /* ENOTREACH */ 1882 exit(0); 1883 } 1884 } 1885 if(shortsoa) { 1886 /* put SOA in xfrd task because mytask may be in use */ 1887 taskudb = nsd->task[1-nsd->mytask]; 1888 } 1889 1890 add_all_soa_to_task(nsd, taskudb); 1891 if(!shortsoa) { 1892 /* wait for xfrd to signal task is ready, RELOAD signal */ 1893 if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) || 1894 cmd != NSD_RELOAD) { 1895 log_msg(LOG_ERR, "did not get start signal from xfrd"); 1896 exit(1); 1897 } 1898 if(nsd->signal_hint_shutdown) { 1899 goto shutdown; 1900 } 1901 } 1902 /* give xfrd our task, signal it with RELOAD_DONE */ 1903 task_process_sync(taskudb); 1904 cmd = NSD_RELOAD_DONE; 1905 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1906 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1907 (int)nsd->pid, strerror(errno)); 1908 } 1909 mypid = getpid(); 
/*
 * Log one OpenSSL error code plus everything still queued on the
 * thread's error stack, at the given syslog level.
 * Strings are formatted by OpenSSL as
 * error:[error code]:[library name]:[function name]:[reason string]
 */
static void
log_crypto_from_err(int level, const char* str, unsigned long err)
{
	char errbuf[128];
	unsigned long next;

	ERR_error_string_n(err, errbuf, sizeof(errbuf));
	log_msg(level, "%s crypto %s", str, errbuf);
	/* drain any further queued errors so none are silently lost */
	for(next = ERR_get_error(); next; next = ERR_get_error()) {
		ERR_error_string_n(next, errbuf, sizeof(errbuf));
		log_msg(level, "and additionally crypto %s", errbuf);
	}
}
if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) || 1973 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) || 1974 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) || 1975 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE) 1976 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO 1977 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER) 1978 #endif 1979 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO 1980 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL) 1981 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL) 1982 # ifdef SSL_R_VERSION_TOO_LOW 1983 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW) 1984 # endif 1985 #endif 1986 ) 1987 return 1; 1988 return 0; 1989 } 1990 1991 void 1992 perform_openssl_init(void) 1993 { 1994 /* init SSL library */ 1995 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS 1996 ERR_load_crypto_strings(); 1997 #endif 1998 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS) 1999 ERR_load_SSL_strings(); 2000 #endif 2001 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO) 2002 OpenSSL_add_all_algorithms(); 2003 #else 2004 OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS 2005 | OPENSSL_INIT_ADD_ALL_DIGESTS 2006 | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL); 2007 #endif 2008 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL) 2009 (void)SSL_library_init(); 2010 #else 2011 OPENSSL_init_ssl(0, NULL); 2012 #endif 2013 2014 if(!RAND_status()) { 2015 /* try to seed it */ 2016 unsigned char buf[256]; 2017 unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid(); 2018 size_t i; 2019 v = seed; 2020 for(i=0; i<256/sizeof(v); i++) { 2021 memmove(buf+i*sizeof(v), &v, sizeof(v)); 2022 v = 
v*seed + (unsigned int)i; 2023 } 2024 RAND_seed(buf, 256); 2025 log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time"); 2026 } 2027 } 2028 2029 static int 2030 get_ocsp(char *filename, unsigned char **ocsp) 2031 { 2032 BIO *bio; 2033 OCSP_RESPONSE *response; 2034 int len = -1; 2035 unsigned char *p, *buf; 2036 assert(filename); 2037 2038 if ((bio = BIO_new_file(filename, "r")) == NULL) { 2039 log_crypto_err("get_ocsp: BIO_new_file failed"); 2040 return -1; 2041 } 2042 2043 if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) { 2044 log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed"); 2045 BIO_free(bio); 2046 return -1; 2047 } 2048 2049 if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) { 2050 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed"); 2051 OCSP_RESPONSE_free(response); 2052 BIO_free(bio); 2053 return -1; 2054 } 2055 2056 if ((buf = malloc((size_t) len)) == NULL) { 2057 log_msg(LOG_ERR, "get_ocsp: malloc failed"); 2058 OCSP_RESPONSE_free(response); 2059 BIO_free(bio); 2060 return -1; 2061 } 2062 2063 p = buf; 2064 if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) { 2065 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed"); 2066 free(buf); 2067 OCSP_RESPONSE_free(response); 2068 BIO_free(bio); 2069 return -1; 2070 } 2071 2072 OCSP_RESPONSE_free(response); 2073 BIO_free(bio); 2074 2075 *ocsp = buf; 2076 return len; 2077 } 2078 2079 /* further setup ssl ctx after the keys are loaded */ 2080 static void 2081 listen_sslctx_setup_2(void* ctxt) 2082 { 2083 SSL_CTX* ctx = (SSL_CTX*)ctxt; 2084 (void)ctx; 2085 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO 2086 if(!SSL_CTX_set_ecdh_auto(ctx,1)) { 2087 /* ENOTREACH */ 2088 log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE"); 2089 } 2090 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME) 2091 if(1) { 2092 EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1); 2093 if (!ecdh) { 2094 
log_crypto_err("could not find p256, not enabling ECDHE"); 2095 } else { 2096 if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) { 2097 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 2098 } 2099 EC_KEY_free (ecdh); 2100 } 2101 } 2102 #endif 2103 } 2104 2105 static int 2106 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 2107 { 2108 if(ocspdata) { 2109 unsigned char *p; 2110 if ((p=malloc(ocspdata_len)) == NULL) { 2111 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 2112 return SSL_TLSEXT_ERR_NOACK; 2113 } 2114 memcpy(p, ocspdata, ocspdata_len); 2115 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 2116 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 2117 free(p); 2118 return SSL_TLSEXT_ERR_NOACK; 2119 } 2120 return SSL_TLSEXT_ERR_OK; 2121 } else { 2122 return SSL_TLSEXT_ERR_NOACK; 2123 } 2124 } 2125 2126 static int 2127 server_alpn_cb(SSL* ATTR_UNUSED(s), 2128 const unsigned char** out, unsigned char* outlen, 2129 const unsigned char* in, unsigned int inlen, 2130 void* ATTR_UNUSED(arg)) 2131 { 2132 static const unsigned char alpns[] = { 3, 'd', 'o', 't' }; 2133 unsigned char* tmp_out; 2134 2135 SSL_select_next_proto(&tmp_out, outlen, alpns, sizeof(alpns), in, inlen); 2136 *out = tmp_out; 2137 return SSL_TLSEXT_ERR_OK; 2138 } 2139 2140 SSL_CTX* 2141 server_tls_ctx_setup(char* key, char* pem, char* verifypem) 2142 { 2143 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 2144 if(!ctx) { 2145 log_crypto_err("could not SSL_CTX_new"); 2146 return NULL; 2147 } 2148 /* no SSLv2, SSLv3 because has defects */ 2149 #if SSL_OP_NO_SSLv2 != 0 2150 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 2151 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 2152 SSL_CTX_free(ctx); 2153 return NULL; 2154 } 2155 #endif 2156 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 2157 != SSL_OP_NO_SSLv3){ 2158 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 2159 SSL_CTX_free(ctx); 2160 return 
0; 2161 } 2162 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 2163 /* if we have tls 1.1 disable 1.0 */ 2164 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 2165 != SSL_OP_NO_TLSv1){ 2166 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 2167 SSL_CTX_free(ctx); 2168 return 0; 2169 } 2170 #endif 2171 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 2172 /* if we have tls 1.2 disable 1.1 */ 2173 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 2174 != SSL_OP_NO_TLSv1_1){ 2175 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 2176 SSL_CTX_free(ctx); 2177 return 0; 2178 } 2179 #endif 2180 #if defined(SSL_OP_NO_TLSv1_2) && defined(SSL_OP_NO_TLSv1_3) 2181 /* if we have tls 1.3 disable 1.2 */ 2182 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_2) & SSL_OP_NO_TLSv1_2) 2183 != SSL_OP_NO_TLSv1_2){ 2184 log_crypto_err("could not set SSL_OP_NO_TLSv1_2"); 2185 SSL_CTX_free(ctx); 2186 return 0; 2187 } 2188 #endif 2189 #if defined(SSL_OP_NO_RENEGOTIATION) 2190 /* disable client renegotiation */ 2191 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 2192 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 2193 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 2194 SSL_CTX_free(ctx); 2195 return 0; 2196 } 2197 #endif 2198 #if defined(SSL_OP_IGNORE_UNEXPECTED_EOF) 2199 /* disable client renegotiation */ 2200 if((SSL_CTX_set_options(ctx, SSL_OP_IGNORE_UNEXPECTED_EOF) & 2201 SSL_OP_IGNORE_UNEXPECTED_EOF) != SSL_OP_IGNORE_UNEXPECTED_EOF) { 2202 log_crypto_warning("could not set SSL_OP_IGNORE_UNEXPECTED_EOF"); 2203 } 2204 #endif 2205 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 2206 /* if we detect system-wide crypto policies, use those */ 2207 if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) { 2208 /* if we have sha256, set the cipher list to have no known vulns */ 2209 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 2210 log_crypto_err("could not set cipher list with 
SSL_CTX_set_cipher_list"); 2211 } 2212 #endif 2213 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 2214 SSL_OP_CIPHER_SERVER_PREFERENCE) != 2215 SSL_OP_CIPHER_SERVER_PREFERENCE) { 2216 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 2217 SSL_CTX_free(ctx); 2218 return 0; 2219 } 2220 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 2221 SSL_CTX_set_security_level(ctx, 0); 2222 #endif 2223 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 2224 log_msg(LOG_ERR, "error for cert file: %s", pem); 2225 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 2226 SSL_CTX_free(ctx); 2227 return NULL; 2228 } 2229 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { 2230 log_msg(LOG_ERR, "error for private key file: %s", key); 2231 log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); 2232 SSL_CTX_free(ctx); 2233 return NULL; 2234 } 2235 if(!SSL_CTX_check_private_key(ctx)) { 2236 log_msg(LOG_ERR, "error for key file: %s", key); 2237 log_crypto_err("Error in SSL_CTX check_private_key"); 2238 SSL_CTX_free(ctx); 2239 return NULL; 2240 } 2241 listen_sslctx_setup_2(ctx); 2242 if(verifypem && verifypem[0]) { 2243 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { 2244 log_crypto_err("Error in SSL_CTX verify locations"); 2245 SSL_CTX_free(ctx); 2246 return NULL; 2247 } 2248 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); 2249 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL); 2250 } 2251 SSL_CTX_set_alpn_select_cb(ctx, server_alpn_cb, NULL); 2252 return ctx; 2253 } 2254 2255 SSL_CTX* 2256 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) 2257 { 2258 char *key, *pem; 2259 SSL_CTX *ctx; 2260 2261 key = nsd->options->tls_service_key; 2262 pem = nsd->options->tls_service_pem; 2263 if(!key || key[0] == 0) { 2264 log_msg(LOG_ERR, "error: no tls-service-key file specified"); 2265 return NULL; 2266 } 2267 if(!pem || pem[0] == 0) { 2268 log_msg(LOG_ERR, "error: no 
tls-service-pem file specified"); 2269 return NULL; 2270 } 2271 2272 /* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but 2273 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/ 2274 ctx = server_tls_ctx_setup(key, pem, verifypem); 2275 if(!ctx) { 2276 log_msg(LOG_ERR, "could not setup server TLS context"); 2277 return NULL; 2278 } 2279 if(ocspfile && ocspfile[0]) { 2280 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { 2281 log_crypto_err("Error reading OCSPfile"); 2282 SSL_CTX_free(ctx); 2283 return NULL; 2284 } else { 2285 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); 2286 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { 2287 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); 2288 SSL_CTX_free(ctx); 2289 return NULL; 2290 } 2291 } 2292 } 2293 return ctx; 2294 } 2295 2296 /* check if tcp_handler_accept_data created for TLS dedicated port */ 2297 int 2298 using_tls_port(struct sockaddr* addr, const char* tls_port) 2299 { 2300 in_port_t port = 0; 2301 2302 if (addr->sa_family == AF_INET) 2303 port = ((struct sockaddr_in*)addr)->sin_port; 2304 #ifndef HAVE_STRUCT_SOCKADDR_IN6 2305 else 2306 port = ((struct sockaddr_in6*)addr)->sin6_port; 2307 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ 2308 if (atoi(tls_port) == ntohs(port)) 2309 return 1; 2310 2311 return 0; 2312 } 2313 #endif 2314 2315 /* pass timeout=-1 for blocking. 
Returns size, 0, -1(err), or -2(timeout) */
ssize_t
block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
{
	uint8_t* buf = (uint8_t*) p;
	ssize_t total = 0;
	struct pollfd fd;
	memset(&fd, 0, sizeof(fd));
	fd.fd = s;
	fd.events = POLLIN;

	/* loop until exactly sz bytes are read, the peer closes, an error
	 * occurs, or the poll times out */
	while( total < sz) {
		ssize_t ret;
		/* timeout is in seconds; -1 means block indefinitely */
		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				/* quit/shutdown hints abort the read; other
				 * signals can be handled later */
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* operation timed out */
			return -2;
		}
		ret = read(s, buf+total, sz-total);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* closed connection! */
			return 0;
		}
		total += ret;
	}
	return total;
}

/*
 * Walk the task list in the current task udb and process every task that
 * is not of type task_apply_xfr in place; the task_apply_xfr tasks are
 * unlinked from the main list and collected (in order) on the
 * xfrs2process list for a second pass. Results of processed tasks are
 * appended after *last_task.
 */
static void
reload_process_non_xfr_tasks(struct nsd* nsd, udb_ptr* xfrs2process,
	udb_ptr* last_task)
{
	udb_ptr t, next, xfr_tail;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_init(&xfr_tail, u);
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	/* detach the list from the udb so it can be rebuilt */
	udb_base_set_userdata(u, 0);
	/* Execute all tasks except of type "task_apply_xfr". */
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		if(TASKLIST(&t)->task_type != task_apply_xfr) {
			/* process task t */
			/* append results for task t and update last_task */
			task_process_in_reload(nsd, u, last_task, &t);

		} else if(udb_ptr_is_null(xfrs2process)) {
			/* first xfr task: starts the xfrs2process list */
			udb_ptr_set_ptr( xfrs2process, u, &t);
			udb_ptr_set_ptr(&xfr_tail, u, &t);
		} else {
			/* append to the tail of the xfrs2process list */
			udb_rptr_set_ptr(&TASKLIST(&xfr_tail)->next, u, &t);
			udb_ptr_set_ptr(&xfr_tail, u, &t);
		}
		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);
	}
	/* t and next are already unlinked (because they are null) */
	udb_ptr_unlink(&xfr_tail, u);
}

/*
 * Apply the collected task_apply_xfr tasks one by one. Between tasks the
 * cmdsocket is polled (non-blocking, timeout 0) for an NSD_QUIT from
 * old-main; on quit the remaining transfer files are unlinked and the
 * process exits. Returns the number of transfers applied.
 */
static size_t
reload_process_xfr_tasks(struct nsd* nsd, int cmdsocket, udb_ptr* xfrs2process)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr next;
	udb_base* u = nsd->task[nsd->mytask];
	size_t xfrs_processed = 0;

	udb_ptr_init(&next, u);
	while(!udb_ptr_is_null(xfrs2process)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(xfrs2process)->next);
		udb_rptr_zero(&TASKLIST(xfrs2process)->next, u);

		/* process xfr task at xfrs2process */
		assert(TASKLIST(xfrs2process)->task_type == task_apply_xfr);
		task_process_apply_xfr(nsd, u, xfrs2process);
		xfrs_processed += 1;

		/* go to next */
		udb_ptr_set_ptr(xfrs2process, u, &next);

		/* if the "old-main" has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) != sizeof(cmd))
			; /* pass */
		else if (cmd != NSD_QUIT)
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from old-main %d", (int)cmd));
		else {
			udb_ptr_unlink(&next, u);
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			/* unlink files of remainder of tasks */
			while(!udb_ptr_is_null(xfrs2process)) {
				assert(TASKLIST(xfrs2process)->task_type == task_apply_xfr);
				xfrd_unlink_xfrfile(nsd, TASKLIST(xfrs2process)->yesno);
				udb_ptr_set_rptr(xfrs2process, u, &TASKLIST(xfrs2process)->next);
			}
			exit(0);
		}
	}
	/* xfrs2process and next are already unlinked (because they are null) */
	return xfrs_processed;
}

static void server_verify(struct nsd *nsd, int cmdsocket,
	struct sigaction* old_sigchld);

/* state shared with the quit-sync event callbacks during server_reload */
struct quit_sync_event_data {
	struct event_base* base;
	/* number of bytes of the reply command read so far */
	size_t read;
	/* reply command buffer; byte view overlays the sig_atomic_t value */
	union {
		uint8_t buf[sizeof(sig_atomic_t)];
		sig_atomic_t cmd;
	} to_read;
};

/* SIGCHLD handler used during the quit sync: reap exited old-serve
 * children so they do not linger as zombies */
static void server_reload_handle_sigchld(int sig, short event,
	void* ATTR_UNUSED(arg))
{
	assert(sig == SIGCHLD);
	assert((event & EV_SIGNAL));

	/* reap the exited old-serve child(s) */
	while(waitpid(-1, NULL, WNOHANG) > 0) {
		/* pass */
	}
}

/*
 * Event callback for the quit sync with old-main. On timeout the
 * NSD_QUIT_SYNC command is (re)sent; on read the reply command is
 * accumulated byte-by-byte. EOF or read error is treated as NSD_RELOAD
 * (old-main is gone; continue as new main). Exits the event loop when a
 * complete command has been received or synthesized.
 */
static void server_reload_handle_quit_sync_ack(int cmdsocket, short event,
	void* arg)
{
	struct quit_sync_event_data* cb_data =
		(struct quit_sync_event_data*)arg;
	ssize_t r;

	if((event & EV_TIMEOUT)) {
		sig_atomic_t cmd = NSD_QUIT_SYNC;

		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending command from "
				"reload to old-main: %s", strerror(errno));
		}
		/* Wait for cmdsocket to become readable or for next timeout,
		 * (this works because event is added EV_TIMEOUT|EV_PERSIST).
		 */
		return;
	}
	assert((event & EV_READ));
	assert(cb_data->read < sizeof(cb_data->to_read.cmd));

	r = read(cmdsocket, cb_data->to_read.buf + cb_data->read,
		sizeof(cb_data->to_read.cmd) - cb_data->read);
	if(r == 0) {
		/* EOF: old-main exited; proceed as if it acked the reload */
		DEBUG(DEBUG_IPC, 1, (LOG_WARNING,
			"reload: old-main quit during quit sync"));
		cb_data->to_read.cmd = NSD_RELOAD;

	} else if(r == -1) {
		if(errno == EAGAIN || errno == EINTR)
			return;

		log_msg(LOG_ERR, "reload: could not wait for parent to quit: "
			"%s", strerror(errno));
		cb_data->to_read.cmd = NSD_RELOAD;

	} else if (cb_data->read + r < sizeof(cb_data->to_read.cmd)) {
		/* More to read */
		cb_data->read += r;
		return;

	} else {
		assert(cb_data->read + r == sizeof(cb_data->to_read.cmd));
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d",
			(int)cb_data->to_read.cmd));
	}
	/* Done */
	event_base_loopexit(cb_data->base, NULL);
}

/*
 * Reload the database, stop parent, re-fork children and continue.
 * as server_main.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket, udb_ptr* xfrs2process, udb_ptr* last_task)
{
	pid_t mypid;
	sig_atomic_t cmd;
	struct sigaction old_sigchld, ign_sigchld;
	struct radnode* node;
	zone_type* zone;
	enum soainfo_hint hint;
	struct quit_sync_event_data cb_data;
	struct event signal_event, cmd_event;
	struct timeval reload_sync_timeout;
	size_t xfrs_processed = 0;
	/* For swapping filedescriptors from the serve childs to the xfrd
	 * and/or the dnstap collector */
	int *swap_fd_send;

	/* ignore SIGCHLD from the previous server_main that used this pid */
	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
	ign_sigchld.sa_handler = SIG_IGN;
	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);

#ifdef HAVE_CPUSET_T
	if(nsd->use_cpu_affinity) {
		set_cpu_affinity(nsd->cpuset);
	}
#endif

	/* see what tasks we got from xfrd */
	xfrs_processed = reload_process_xfr_tasks(nsd, cmdsocket, xfrs2process);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required. */
	time(&nsd->st->boot);
	set_bind8_alarm(nsd);
	/* Switch to a different set of stat array for new server processes,
	 * because they can briefly coexist with the old processes. They
	 * have their own stat structure. */
	nsd->stat_current = (nsd->stat_current==0?1:0);
#endif
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for new children */
	server_zonestat_switch(nsd);
#endif

	if(nsd->options->verify_enable) {
#ifdef RATELIMIT
		/* allocate resources for rate limiting. use a slot that is guaranteed
		   not mapped to a file so no persistent data is overwritten */
		rrl_init(nsd->child_count + 1);
#endif

		/* spin-up server and execute verifiers for each zone */
		server_verify(nsd, cmdsocket, &old_sigchld);
#ifdef RATELIMIT
		/* deallocate rate limiting resources */
		rrl_deinit(nsd->child_count + 1);
#endif
	}

	/* report the outcome per zone to xfrd; only needed when transfers
	 * were actually applied */
	if(xfrs_processed) for( node = radix_first(nsd->db->zonetree)
	                      ; node != NULL; node = radix_next(node)) {

		zone = (zone_type *)node->elem;
		if(zone->is_updated) {
			if(zone->is_bad) {
				nsd->mode = NSD_RELOAD_FAILED;
				hint = soainfo_bad;
			} else {
				hint = soainfo_ok;
			}
			/* update(s), verified or not, possibly with subsequent
			   skipped update(s). skipped update(s) are picked up
			   by failed update check in xfrd */
			task_new_soainfo(nsd->task[nsd->mytask], last_task,
				zone, hint);
		} else if(zone->is_skipped) {
			/* corrupt or inconsistent update without preceding
			   update(s), communicate soainfo_gone */
			task_new_soainfo(nsd->task[nsd->mytask], last_task,
				zone, soainfo_gone);
		}
		zone->is_updated = 0;
		zone->is_skipped = 0;
	}

	if(nsd->mode == NSD_RELOAD_FAILED) {
		exit(NSD_RELOAD_FAILED);
	}
#ifdef BIND8_STATS
	nsd->stats_per_child[nsd->stat_current][0].reloadcount =
		nsd->stats_per_child[(nsd->stat_current==0?1:0)][0].reloadcount+1;
	nsd->stats_per_child[nsd->stat_current][0].db_mem =
		region_get_mem(nsd->db->region);
#endif

	/* listen for the signals of failed children again */
	sigaction(SIGCHLD, &old_sigchld, NULL);
#ifdef USE_DNSTAP
	if (nsd->dt_collector) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
		/* Swap fd_send with fd_swap so old serve child and new serve
		 * childs will not write to the same pipe ends simultaneously */
		swap_fd_send = nsd->dt_collector_fd_send;
		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
		nsd->dt_collector_fd_swap = swap_fd_send;

	}
#endif
	/* same swap for the serve-to-xfrd pipe ends */
	swap_fd_send = nsd->serve2xfrd_fd_send;
	nsd->serve2xfrd_fd_send = nsd->serve2xfrd_fd_swap;
	nsd->serve2xfrd_fd_swap = swap_fd_send;
	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the old-main has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to old-main: blocking, wait for receipt.
	 * The old-main process asks the old-serve processes to quit, however
	 * if a reload succeeded before, this process is the parent of the
	 * old-serve processes, so we need to reap the children for it.
	 */
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
	cmd = NSD_QUIT_SYNC;
	if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
	{
		log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
			strerror(errno));
	}

	reload_sync_timeout.tv_sec = RELOAD_SYNC_TIMEOUT;
	reload_sync_timeout.tv_usec = 0;

	cb_data.base = nsd_child_event_base();
	cb_data.to_read.cmd = cmd;
	cb_data.read = 0;

	/* reap old-serve children while we wait for the ack from old-main */
	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST,
		server_reload_handle_sigchld, NULL);
	if(event_base_set(cb_data.base, &signal_event) != 0
	|| signal_add(&signal_event, NULL) != 0) {
		log_msg(LOG_ERR, "NSD quit sync: could not add signal event");
	}

	/* persistent read+timeout event: timeout resends NSD_QUIT_SYNC,
	 * read collects the reply (see server_reload_handle_quit_sync_ack) */
	event_set(&cmd_event, cmdsocket, EV_READ|EV_TIMEOUT|EV_PERSIST,
		server_reload_handle_quit_sync_ack, &cb_data);
	if(event_base_set(cb_data.base, &cmd_event) != 0
	|| event_add(&cmd_event, &reload_sync_timeout) != 0) {
		log_msg(LOG_ERR, "NSD quit sync: could not add command event");
	}

	/* short-lived main loop */
	event_base_dispatch(cb_data.base);

	/* remove command and signal event handlers */
	event_del(&cmd_event);
	signal_del(&signal_event);
	/* restore the ordinary signal handler for SIGCHLD */
	sigaction(SIGCHLD, &old_sigchld, NULL);
	event_base_free(cb_data.base);
	cmd = cb_data.to_read.cmd;

	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(cmd == NSD_RELOAD);
	udb_ptr_set(last_task, nsd->task[nsd->mytask], 0);
	task_process_sync(nsd->task[nsd->mytask]);
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for next children */
#endif

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	/* try to reopen file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}

/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
 */
/* The checks below are in priority order (quit before shutdown before
 * child-reap etc.); exactly one hint is consumed (reset to 0) per call,
 * so remaining hints are picked up on subsequent calls. The hint flags
 * are sig_atomic_t values set from signal handlers. */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		/* re-arm the periodic stats alarm */
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	/* no hints pending: keep running */
	return NSD_RUN;
}

/*
 * The main server simply waits for signals and child processes to
 * terminate. Child processes are restarted as necessary.
 */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	/* pointer to the xfr tasks that will be processed in a second pass */
	udb_ptr xfrs2process;
	/* pointer to results of task processing */
	udb_ptr last_task;
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

#ifdef BIND8_STATS
	nsd->st = &nsd->stat_map[0];
	nsd->st->db_disk = 0;
	nsd->st->db_mem = region_get_mem(nsd->db->region);
#endif
	memset(&xfrs2process, 0, sizeof(xfrs2process));
	memset(&last_task, 0, sizeof(last_task));

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* This_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode? */
		if(mode == NSD_RUN) {
			nsd->mode = mode = server_signal_mode(nsd);
		}

		switch (mode) {
		case NSD_RUN:
			/* see if any child processes terminated */
			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
				int is_child = delete_child_pid(nsd, child_pid);
				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
					/* expected exit during shutdown */
					if(nsd->children[is_child].child_fd == -1)
						nsd->children[is_child].has_exited = 1;
					parent_check_all_children_exited(nsd);
				} else if(is_child != -1) {
					/* unexpected child death: restart */
					log_msg(LOG_WARNING,
					       "server %d died unexpectedly with status %d, restarting",
					       (int) child_pid, status);
					restart_child_servers(nsd, server_region, netio,
						&nsd->xfrd_listener->fd);
				} else if (child_pid == reload_pid) {
					/* the reload (load) process failed;
					 * clean up and keep the old database */
					sig_atomic_t cmd = NSD_RELOAD_FAILED;
					pid_t mypid;
					log_msg(LOG_WARNING,
					       "Reload process %d failed with status %d, continuing with old database",
					       (int) child_pid, status);
#ifdef HAVE_SETPROCTITLE
					setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
					log_set_process_role("main");
#endif
					reload_pid = -1;
					if(reload_listener.fd != -1) close(reload_listener.fd);
					netio_remove_handler(netio, &reload_listener);
					reload_listener.fd = -1;
					reload_listener.event_types = NETIO_EVENT_NONE;
					task_process_sync(nsd->task[nsd->mytask]);
					/* inform xfrd reload attempt ended */
					if(!write_socket(nsd->xfrd_listener->fd,
						&cmd, sizeof(cmd))) {
						log_msg(LOG_ERR, "problems "
							"sending SOAEND to xfrd: %s",
							strerror(errno));
					}
					mypid = getpid();
					if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
							strerror(errno));
					}
#ifdef USE_DNSTAP
				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
					log_msg(LOG_WARNING,
					       "dnstap-collector %d terminated with status %d",
					       (int) child_pid, status);
					if(nsd->dt_collector) {
						dt_collector_close(nsd->dt_collector, nsd);
						dt_collector_destroy(nsd->dt_collector, nsd);
						nsd->dt_collector = NULL;
					}
					/* Only respawn a crashed (or exited)
					 * dnstap-collector when not reloading,
					 * to not induce a reload during a
					 * reload (which would seriously
					 * disrupt nsd procedures and lead to
					 * unpredictable results)!
					 *
					 * This will *leave* a dnstap-collector
					 * process terminated, but because
					 * signalling of the reload process to
					 * the main process to respawn in this
					 * situation will be cumbersome, and
					 * because this situation is so
					 * specific (and therefore hopefully
					 * extremely rare or non-existing at
					 * all), plus the fact that we are left
					 * with a perfectly function NSD
					 * (besides not logging dnstap
					 * messages), I consider it acceptable
					 * to leave this unresolved.
					 */
					if(reload_pid == -1 && nsd->options->dnstap_enable) {
						nsd->dt_collector = dt_collector_create(nsd);
						dt_collector_start(nsd->dt_collector, nsd);
						nsd->mode = NSD_RELOAD_REQ;
					}
#endif
				} else if(status != 0) {
					/* check for status, because we get
					 * the old-servermain because reload
					 * is the process-parent of old-main,
					 * and we get older server-processes
					 * that are exiting after a reload */
					log_msg(LOG_WARNING,
					       "process %d terminated with status %d",
					       (int) child_pid, status);
				}
			}
			if (child_pid == -1) {
				if (errno == EINTR) {
					continue;
				}
				if (errno != ECHILD)
					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
			}
			if (nsd->mode != NSD_RUN)
				break;

			/* timeout to collect processes. In case no sigchild happens. */
			timeout_spec.tv_sec = 1;
			timeout_spec.tv_nsec = 0;

			/* listen on ports, timeout for collecting terminated children */
			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
				}
			}
			if(nsd->restart_children) {
				restart_child_servers(nsd, server_region, netio,
					&nsd->xfrd_listener->fd);
				nsd->restart_children = 0;
			}
			if(nsd->reload_failed) {
				/* reload reported failure via flag (not via
				 * child exit); same cleanup as above */
				sig_atomic_t cmd = NSD_RELOAD_FAILED;
				pid_t mypid;
				nsd->reload_failed = 0;
				log_msg(LOG_WARNING,
				       "Reload process %d failed, continuing with old database",
				       (int) reload_pid);
#ifdef HAVE_SETPROCTITLE
				setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
				log_set_process_role("main");
#endif
				reload_pid = -1;
				if(reload_listener.fd != -1) close(reload_listener.fd);
				netio_remove_handler(netio, &reload_listener);
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				task_process_sync(nsd->task[nsd->mytask]);
				/* inform xfrd reload attempt ended */
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "problems "
						"sending SOAEND to xfrd: %s",
						strerror(errno));
				}
				mypid = getpid();
				if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
						strerror(errno));
				}
			}

			break;
		case NSD_RELOAD_REQ: {
			/* SIGHUP: ask xfrd to initiate the reload */
			sig_atomic_t cmd = NSD_RELOAD_REQ;
			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
			DEBUG(DEBUG_IPC,1, (LOG_INFO,
				"main: ipc send reload_req to xfrd"));
			if(!write_socket(nsd->xfrd_listener->fd,
				&cmd, sizeof(cmd))) {
				log_msg(LOG_ERR, "server_main: could not send "
					"reload_req to xfrd: %s", strerror(errno));
			}
			nsd->mode = NSD_RUN;
			} break;
		case NSD_RELOAD:
			/* Continue to run nsd after reload */
			nsd->mode = NSD_RUN;
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
			if (reload_pid != -1) {
				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
				       (int) reload_pid);
				break;
			}

			/* switch the mytask to keep track of who owns task*/
			nsd->mytask = 1 - nsd->mytask;
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
				reload_pid = -1;
				break;
			}
			/* Execute the tasks that cannot fail */
#ifdef HAVE_SETPROCTITLE
			setproctitle("load");
#endif
#ifdef USE_LOG_PROCESS_ROLE
			log_set_process_role("load");
#endif
			/* Already process the non xfr tasks, so that a failed
			 * transfer (which can exit) will not nullify the
			 * effects of the other tasks that will not exit.
			 */
			task_remap(nsd->task[nsd->mytask]);
			udb_ptr_init(&xfrs2process, nsd->task[nsd->mytask]);
			udb_ptr_init(&last_task  , nsd->task[nsd->mytask]);
			/* last_task and xfrs2process MUST be unlinked in all
			 * possible branches of the fork() below.
			 * server_reload() will unlink them, but for failed
			 * fork and for the "old-main" (child) process, we MUST
			 * unlink them in the case statement below.
			 * Unlink by setting the value to 0, because
			 * reload_process_non_xfr_tasks() may clear (and
			 * implicitly unlink) xfrs2process.
			 */
			reload_process_non_xfr_tasks(nsd, &xfrs2process
			                                , &last_task);
			/* Do actual reload */
			reload_pid = fork();
			switch (reload_pid) {
			case -1:
				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
				udb_ptr_set(&last_task, nsd->task[nsd->mytask], 0);
				udb_ptr_set(&xfrs2process, nsd->task[nsd->mytask], 0);
				break;
			default:
				/* PARENT */
				close(reload_sockets[0]);
				/* note: server_reload only returns when this
				 * process continues as the new main */
				server_reload(nsd, server_region, netio
				             , reload_sockets[1]
				             , &xfrs2process
				             , &last_task);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
				close(reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
				/* drop stale xfrd ipc data */
				((struct ipc_handler_conn_data*)nsd->
					xfrd_listener->user_data)
					->conn->is_reading = 0;
				reload_pid = -1;
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
				break;
			case 0:
				/* CHILD */
				/* server_main keep running until NSD_QUIT_SYNC
				 * received from reload.
				 */
				close(reload_sockets[1]);
#ifdef HAVE_SETPROCTITLE
				setproctitle("old-main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
				log_set_process_role("old-main");
#endif
				udb_ptr_set(&last_task, nsd->task[nsd->mytask], 0);
				udb_ptr_set(&xfrs2process, nsd->task[nsd->mytask], 0);
				reload_listener.fd = reload_sockets[0];
				reload_listener.timeout = NULL;
				reload_listener.user_data = nsd;
				reload_listener.event_types = NETIO_EVENT_READ;
				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
				netio_add_handler(netio, &reload_listener);
				reload_pid = getppid();
				break;
			}
			if(reload_pid == -1) {
				/* Reset proctitle after "load" process exited
				 * or when fork() failed
				 */
#ifdef HAVE_SETPROCTITLE
				setproctitle("main");
#endif
#ifdef USE_LOG_PROCESS_ROLE
				log_set_process_role("main");
#endif
			}
			break;
		case NSD_QUIT_SYNC:
			/* synchronisation of xfrd, parent and reload */
			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
				sig_atomic_t cmd = NSD_RELOAD;
				/* stop xfrd ipc writes in progress */
				DEBUG(DEBUG_IPC,1, (LOG_INFO,
					"main: ipc send indication reload"));
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: could not send reload "
						"indication to xfrd: %s", strerror(errno));
				}
				/* wait for ACK from xfrd */
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
				nsd->quit_sync_done = 1;
			}
			nsd->mode = NSD_RUN;
			break;
		case NSD_QUIT:
			/* silent shutdown during reload */
			if(reload_listener.fd != -1) {
				/* acknowledge the quit, to sync reload that we will really quit now */
				sig_atomic_t cmd = NSD_RELOAD;
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: "
						"could not ack quit: %s", strerror(errno));
				}
				close(reload_listener.fd);
			}
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
			/* only quit children after xfrd has acked */
			send_children_quit(nsd);

#ifdef MEMCLEAN /* OS collects memory pages */
			region_destroy(server_region);
#endif
			server_shutdown(nsd);

			/* ENOTREACH */
			break;
		case NSD_SHUTDOWN:
			break;
		case NSD_REAP_CHILDREN:
			/* continue; wait for child in run loop */
			nsd->mode = NSD_RUN;
			break;
		case NSD_STATS:
#ifdef BIND8_STATS
			set_children_stats(nsd);
#endif
			nsd->mode = NSD_RUN;
			break;
		default:
			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
			nsd->mode = NSD_RUN;
			break;
		}
	}
	log_msg(LOG_WARNING, "signal received, shutting down...");

	/* close opened ports to avoid race with restart of nsd */
	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	daemon_remote_close(nsd->rc);
	send_children_quit_and_wait(nsd);

	/* Unlink it if possible... */
	unlinkpid(nsd->pidfile, nsd->username);
	unlink(nsd->task[0]->fname);
	unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
	unlink(nsd->zonestatfname[0]);
	unlink(nsd->zonestatfname[1]);
#endif
#ifdef BIND8_STATS
	server_stat_free(nsd);
#endif
#ifdef USE_DNSTAP
	dt_collector_close(nsd->dt_collector, nsd);
#endif

	if(reload_listener.fd != -1) {
		/* tell the reload process to quit and wait for it */
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to reload-process"));
		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
				strerror(errno));
		}
		fsync(reload_listener.fd);
		close(reload_listener.fd);
		/* wait for reload to finish processing */
		while(1) {
			if(waitpid(reload_pid, NULL, 0) == -1) {
				if(errno == EINTR) continue;
				if(errno == ECHILD) break;
				log_msg(LOG_ERR, "waitpid(reload %d): %s",
					(int)reload_pid, strerror(errno));
			}
			break;
		}
	}
	if(nsd->xfrd_listener->fd != -1) {
		/* complete quit, stop xfrd */
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to xfrd"));
		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
				strerror(errno));
		}
		fsync(nsd->xfrd_listener->fd);
		close(nsd->xfrd_listener->fd);
		(void)kill(nsd->pid, SIGTERM);
	}

#ifdef USE_XDP
	xdp_server_cleanup(&nsd->xdp.xdp_server);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
	region_destroy(server_region);
#endif
	server_shutdown(nsd);
}

/* thin wrapper around query_process() so it matches the process-query
 * callback signature used by the UDP/TCP handlers */
static query_state_type
server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
{
	return query_process(query, nsd, now_p);
}

static query_state_type
server_process_query_udp(struct nsd
*nsd, struct query *query, uint32_t *now_p) 3235 { 3236 #ifdef RATELIMIT 3237 if(query_process(query, nsd, now_p) != QUERY_DISCARDED) { 3238 if(query->edns.cookie_status != COOKIE_VALID 3239 && query->edns.cookie_status != COOKIE_VALID_REUSE 3240 && rrl_process_query(query)) 3241 return rrl_slip(query); 3242 else return QUERY_PROCESSED; 3243 } 3244 return QUERY_DISCARDED; 3245 #else 3246 return query_process(query, nsd, now_p); 3247 #endif 3248 } 3249 3250 const char* 3251 nsd_event_vs(void) 3252 { 3253 #ifdef USE_MINI_EVENT 3254 return ""; 3255 #else 3256 return event_get_version(); 3257 #endif 3258 } 3259 3260 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 3261 static const char* ub_ev_backend2str(int b) 3262 { 3263 switch(b) { 3264 case EVBACKEND_SELECT: return "select"; 3265 case EVBACKEND_POLL: return "poll"; 3266 case EVBACKEND_EPOLL: return "epoll"; 3267 case EVBACKEND_KQUEUE: return "kqueue"; 3268 case EVBACKEND_DEVPOLL: return "devpoll"; 3269 case EVBACKEND_PORT: return "evport"; 3270 } 3271 return "unknown"; 3272 } 3273 #endif 3274 3275 const char* 3276 nsd_event_method(void) 3277 { 3278 #ifdef USE_MINI_EVENT 3279 return "select"; 3280 #else 3281 struct event_base* b = nsd_child_event_base(); 3282 const char* m; 3283 # ifdef EV_FEATURE_BACKENDS 3284 m = ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 3285 # elif defined(HAVE_EVENT_BASE_GET_METHOD) 3286 m = event_base_get_method(b); 3287 # else 3288 m = "?"; 3289 # endif 3290 # ifdef MEMCLEAN 3291 event_base_free(b); 3292 # endif 3293 return m; 3294 #endif 3295 } 3296 3297 struct event_base* 3298 nsd_child_event_base(void) 3299 { 3300 struct event_base* base; 3301 #ifdef USE_MINI_EVENT 3302 static time_t secs; 3303 static struct timeval now; 3304 base = event_init(&secs, &now); 3305 #else 3306 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 3307 /* libev */ 3308 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 3309 # else 3310 /* libevent */ 3311 # ifdef 
HAVE_EVENT_BASE_NEW 3312 base = event_base_new(); 3313 # else 3314 base = event_init(); 3315 # endif 3316 # endif 3317 #endif 3318 return base; 3319 } 3320 3321 static void 3322 add_udp_handler( 3323 struct nsd *nsd, 3324 struct nsd_socket *sock, 3325 struct udp_handler_data *data) 3326 { 3327 struct event *handler = &data->event; 3328 3329 data->nsd = nsd; 3330 data->socket = sock; 3331 3332 if(nsd->options->proxy_protocol_port && 3333 sockaddr_uses_proxy_protocol_port(nsd->options, 3334 (struct sockaddr *)&sock->addr.ai_addr)) { 3335 data->pp2_enabled = 1; 3336 } 3337 3338 memset(handler, 0, sizeof(*handler)); 3339 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 3340 if(event_base_set(nsd->event_base, handler) != 0) 3341 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 3342 if(event_add(handler, NULL) != 0) 3343 log_msg(LOG_ERR, "nsd udp: event_add failed"); 3344 } 3345 3346 void 3347 add_tcp_handler( 3348 struct nsd *nsd, 3349 struct nsd_socket *sock, 3350 struct tcp_accept_handler_data *data) 3351 { 3352 struct event *handler = &data->event; 3353 3354 data->nsd = nsd; 3355 data->socket = sock; 3356 3357 if(nsd->options->proxy_protocol_port && 3358 sockaddr_uses_proxy_protocol_port(nsd->options, 3359 (struct sockaddr *)&sock->addr.ai_addr)) { 3360 data->pp2_enabled = 1; 3361 } 3362 3363 #ifdef HAVE_SSL 3364 if (nsd->tls_ctx && 3365 nsd->options->tls_port && 3366 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 3367 { 3368 data->tls_accept = 1; 3369 if(verbosity >= 2) { 3370 char buf[48]; 3371 addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 3372 VERBOSITY(5, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 3373 } 3374 } else { 3375 data->tls_accept = 0; 3376 } 3377 if (nsd->tls_auth_ctx && 3378 nsd->options->tls_auth_port && 3379 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_auth_port)) 3380 { 3381 data->tls_auth_accept = 1; 3382 
if(verbosity >= 2) { 3383 char buf[48]; 3384 addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 3385 VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS-AUTH service on interface %s", buf)); 3386 } 3387 3388 } else { 3389 data->tls_auth_accept = 0; 3390 } 3391 #endif 3392 3393 memset(handler, 0, sizeof(*handler)); 3394 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 3395 if(event_base_set(nsd->event_base, handler) != 0) 3396 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 3397 if(event_add(handler, NULL) != 0) 3398 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 3399 data->event_added = 1; 3400 } 3401 3402 #ifdef USE_XDP 3403 static void 3404 add_xdp_handler(struct nsd *nsd, 3405 struct xdp_server *xdp, 3406 struct xdp_handler_data *data) { 3407 3408 int sock; 3409 struct event *handler = &data->event; 3410 3411 data->nsd = nsd; 3412 data->server = xdp; 3413 3414 memset(handler, 0, sizeof(*handler)); 3415 sock = xsk_socket__fd(xdp->xsks[xdp->queue_index].xsk); 3416 if (sock < 0) { 3417 log_msg(LOG_ERR, "xdp: xsk socket file descriptor is invalid: %s", 3418 strerror(errno)); 3419 return; 3420 } 3421 // TODO: check which EV_flags are needed 3422 event_set(handler, sock, EV_PERSIST|EV_READ, handle_xdp, data); 3423 if (event_base_set(nsd->event_base, handler) != 0) 3424 log_msg(LOG_ERR, "nsd xdp: event_base_set failed"); 3425 if (event_add(handler, NULL) != 0) 3426 log_msg(LOG_ERR, "nsd xdp: event_add failed"); 3427 } 3428 #endif 3429 3430 /* 3431 * Serve DNS request to verifiers (short-lived) 3432 */ 3433 static void server_verify(struct nsd *nsd, int cmdsocket, 3434 struct sigaction* old_sigchld) 3435 { 3436 size_t size = 0; 3437 struct event cmd_event, signal_event, exit_event; 3438 struct zone *zone; 3439 3440 assert(nsd != NULL); 3441 3442 zone = verify_next_zone(nsd, NULL); 3443 if(zone == NULL) 3444 return; 3445 3446 nsd->server_region = region_create(xalloc, free); 3447 nsd->event_base = 
nsd_child_event_base(); 3448 3449 nsd->next_zone_to_verify = zone; 3450 nsd->verifier_count = 0; 3451 nsd->verifier_limit = nsd->options->verifier_count; 3452 size = sizeof(struct verifier) * nsd->verifier_limit; 3453 if(pipe(nsd->verifier_pipe) == -1) { 3454 log_msg(LOG_ERR, "verify: could not create pipe: %s", 3455 strerror(errno)); 3456 goto fail_pipe; 3457 } 3458 fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC); 3459 fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC); 3460 nsd->verifiers = region_alloc_zero(nsd->server_region, size); 3461 3462 for(size_t i = 0; i < nsd->verifier_limit; i++) { 3463 nsd->verifiers[i].nsd = nsd; 3464 nsd->verifiers[i].zone = NULL; 3465 nsd->verifiers[i].pid = -1; 3466 nsd->verifiers[i].output_stream.fd = -1; 3467 nsd->verifiers[i].output_stream.priority = LOG_INFO; 3468 nsd->verifiers[i].error_stream.fd = -1; 3469 nsd->verifiers[i].error_stream.priority = LOG_ERR; 3470 } 3471 3472 event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd); 3473 if(event_base_set(nsd->event_base, &cmd_event) != 0 || 3474 event_add(&cmd_event, NULL) != 0) 3475 { 3476 log_msg(LOG_ERR, "verify: could not add command event"); 3477 goto fail; 3478 } 3479 3480 event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd); 3481 if(event_base_set(nsd->event_base, &signal_event) != 0 || 3482 signal_add(&signal_event, NULL) != 0) 3483 { 3484 log_msg(LOG_ERR, "verify: could not add signal event"); 3485 goto fail; 3486 } 3487 3488 event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd); 3489 if(event_base_set(nsd->event_base, &exit_event) != 0 || 3490 event_add(&exit_event, NULL) != 0) 3491 { 3492 log_msg(LOG_ERR, "verify: could not add exit event"); 3493 goto fail; 3494 } 3495 3496 memset(msgs, 0, sizeof(msgs)); 3497 for (int i = 0; i < NUM_RECV_PER_SELECT; i++) { 3498 queries[i] = query_create(nsd->server_region, 3499 compressed_dname_offsets, 3500 compression_table_size, 
compressed_dnames); 3501 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3502 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3503 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3504 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3505 msgs[i].msg_hdr.msg_iovlen = 1; 3506 msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr; 3507 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3508 } 3509 3510 for (size_t i = 0; i < nsd->verify_ifs; i++) { 3511 struct udp_handler_data *data; 3512 data = region_alloc_zero( 3513 nsd->server_region, sizeof(*data)); 3514 add_udp_handler(nsd, &nsd->verify_udp[i], data); 3515 } 3516 3517 tcp_accept_handler_count = nsd->verify_ifs; 3518 tcp_accept_handlers = region_alloc_array(nsd->server_region, 3519 nsd->verify_ifs, sizeof(*tcp_accept_handlers)); 3520 3521 for (size_t i = 0; i < nsd->verify_ifs; i++) { 3522 struct tcp_accept_handler_data *data; 3523 data = &tcp_accept_handlers[i]; 3524 memset(data, 0, sizeof(*data)); 3525 add_tcp_handler(nsd, &nsd->verify_tcp[i], data); 3526 } 3527 3528 while(nsd->next_zone_to_verify != NULL && 3529 nsd->verifier_count < nsd->verifier_limit) 3530 { 3531 verify_zone(nsd, nsd->next_zone_to_verify); 3532 nsd->next_zone_to_verify 3533 = verify_next_zone(nsd, nsd->next_zone_to_verify); 3534 } 3535 3536 /* short-lived main loop */ 3537 event_base_dispatch(nsd->event_base); 3538 3539 /* remove command and exit event handlers */ 3540 event_del(&exit_event); 3541 event_del(&cmd_event); 3542 3543 assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT); 3544 assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT); 3545 signal_del(&signal_event); 3546 fail: 3547 sigaction(SIGCHLD, old_sigchld, NULL); 3548 close(nsd->verifier_pipe[0]); 3549 close(nsd->verifier_pipe[1]); 3550 fail_pipe: 3551 event_base_free(nsd->event_base); 3552 region_destroy(nsd->server_region); 3553 3554 nsd->event_base = NULL; 3555 nsd->server_region = NULL; 3556 nsd->verifier_limit = 0; 3557 nsd->verifier_pipe[0] = 
-1; 3558 nsd->verifier_pipe[1] = -1; 3559 nsd->verifiers = NULL; 3560 } 3561 3562 /* 3563 * Serve DNS requests. 3564 */ 3565 void 3566 server_child(struct nsd *nsd) 3567 { 3568 size_t i, from, numifs; 3569 region_type *server_region = region_create(xalloc, free); 3570 struct event_base* event_base = nsd_child_event_base(); 3571 sig_atomic_t mode; 3572 #ifdef USE_LOG_PROCESS_ROLE 3573 static char child_name[20]; 3574 #endif 3575 3576 if(!event_base) { 3577 log_msg(LOG_ERR, "nsd server could not create event base"); 3578 exit(1); 3579 } 3580 nsd->event_base = event_base; 3581 nsd->server_region = server_region; 3582 3583 #ifdef RATELIMIT 3584 rrl_init(nsd->this_child->child_num); 3585 #endif 3586 3587 assert(nsd->server_kind != NSD_SERVER_MAIN); 3588 3589 #ifdef HAVE_SETPROCTITLE 3590 setproctitle("server %d", nsd->this_child->child_num + 1); 3591 #endif 3592 #ifdef USE_LOG_PROCESS_ROLE 3593 snprintf(child_name, sizeof(child_name), "srv%d", 3594 nsd->this_child->child_num + 1); 3595 log_set_process_role(child_name); 3596 #endif 3597 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 3598 3599 #ifdef HAVE_CPUSET_T 3600 if(nsd->use_cpu_affinity) { 3601 set_cpu_affinity(nsd->this_child->cpuset); 3602 } 3603 #endif 3604 #ifdef BIND8_STATS 3605 nsd->st = &nsd->stats_per_child[nsd->stat_current] 3606 [nsd->this_child->child_num]; 3607 nsd->st->boot = nsd->stat_map[0].boot; 3608 memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc)); 3609 #endif 3610 3611 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 3612 server_close_all_sockets(nsd->tcp, nsd->ifs); 3613 } 3614 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 3615 server_close_all_sockets(nsd->udp, nsd->ifs); 3616 } 3617 3618 if (nsd->this_child->parent_fd != -1) { 3619 struct event *handler; 3620 struct ipc_handler_conn_data* user_data = 3621 (struct ipc_handler_conn_data*)region_alloc( 3622 server_region, sizeof(struct ipc_handler_conn_data)); 3623 user_data->nsd = nsd; 3624 user_data->conn = 
xfrd_tcp_create(server_region, QIOBUFSZ); 3625 3626 handler = (struct event*) region_alloc( 3627 server_region, sizeof(*handler)); 3628 memset(handler, 0, sizeof(*handler)); 3629 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 3630 EV_READ, child_handle_parent_command, user_data); 3631 if(event_base_set(event_base, handler) != 0) 3632 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 3633 if(event_add(handler, NULL) != 0) 3634 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 3635 } 3636 3637 if(nsd->reuseport) { 3638 numifs = nsd->ifs / nsd->reuseport; 3639 from = numifs * nsd->this_child->child_num; 3640 if(from+numifs > nsd->ifs) { /* should not happen */ 3641 from = 0; 3642 numifs = nsd->ifs; 3643 } 3644 } else { 3645 from = 0; 3646 numifs = nsd->ifs; 3647 } 3648 3649 if ((nsd->server_kind & NSD_SERVER_UDP)) { 3650 int child = nsd->this_child->child_num; 3651 memset(msgs, 0, sizeof(msgs)); 3652 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 3653 queries[i] = query_create(server_region, 3654 compressed_dname_offsets, 3655 compression_table_size, compressed_dnames); 3656 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3657 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3658 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3659 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3660 msgs[i].msg_hdr.msg_iovlen = 1; 3661 msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr; 3662 msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen; 3663 } 3664 3665 for (i = 0; i < nsd->ifs; i++) { 3666 int listen; 3667 struct udp_handler_data *data; 3668 3669 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 3670 3671 if(i >= from && i < (from + numifs) && listen) { 3672 data = region_alloc_zero( 3673 nsd->server_region, sizeof(*data)); 3674 add_udp_handler(nsd, &nsd->udp[i], data); 3675 } else { 3676 /* close sockets intended for other servers */ 3677 server_close_socket(&nsd->udp[i]); 3678 } 3679 } 3680 } 3681 3682 /* 3683 * Keep track of all the TCP 
accept handlers so we can enable 3684 * and disable them based on the current number of active TCP 3685 * connections. 3686 */ 3687 if ((nsd->server_kind & NSD_SERVER_TCP)) { 3688 int child = nsd->this_child->child_num; 3689 tcp_accept_handler_count = numifs; 3690 tcp_accept_handlers = region_alloc_array(server_region, 3691 numifs, sizeof(*tcp_accept_handlers)); 3692 3693 for (i = 0; i < nsd->ifs; i++) { 3694 int listen; 3695 struct tcp_accept_handler_data *data; 3696 3697 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 3698 3699 if(i >= from && i < (from + numifs) && listen) { 3700 data = &tcp_accept_handlers[i-from]; 3701 memset(data, 0, sizeof(*data)); 3702 add_tcp_handler(nsd, &nsd->tcp[i], data); 3703 } else { 3704 /* close sockets intended for other servers */ 3705 server_close_socket(&nsd->tcp[i]); 3706 } 3707 } 3708 } else { 3709 tcp_accept_handler_count = 0; 3710 } 3711 3712 #ifdef USE_XDP 3713 if (nsd->options->xdp_interface) { 3714 /* don't try to bind more sockets than there are queues available */ 3715 if ((int)nsd->xdp.xdp_server.queue_count <= nsd->this_child->child_num) { 3716 log_msg(LOG_WARNING, 3717 "xdp: server-count exceeds available queues (%d) on " 3718 "interface %s, skipping xdp in this process", 3719 nsd->xdp.xdp_server.queue_count, 3720 nsd->xdp.xdp_server.interface_name); 3721 } else { 3722 struct xdp_handler_data *data; 3723 const int scratch_data_len = 1; 3724 void *scratch_data = region_alloc_zero(nsd->server_region, 3725 scratch_data_len); 3726 3727 nsd->xdp.xdp_server.queue_index = nsd->this_child->child_num; 3728 nsd->xdp.xdp_server.queries = xdp_queries; 3729 3730 log_msg(LOG_INFO, 3731 "xdp: using socket with queue_id %d on interface %s", 3732 nsd->xdp.xdp_server.queue_index, 3733 nsd->xdp.xdp_server.interface_name); 3734 3735 data = region_alloc_zero(nsd->server_region, sizeof(*data)); 3736 add_xdp_handler(nsd, &nsd->xdp.xdp_server, data); 3737 3738 for (i = 0; i < XDP_RX_BATCH_SIZE; i++) { 3739 /* Be aware that the 
buffer is initialized with scratch data 3740 * and will be filled by the xdp handle and receive function 3741 * that receives the packet data. 3742 * Using scratch data so that the existing functions in regards 3743 * to queries and buffers don't break by use of NULL pointers */ 3744 struct buffer *buffer = region_alloc_zero( 3745 nsd->server_region, 3746 sizeof(struct buffer)); 3747 buffer_create_from(buffer, scratch_data, scratch_data_len); 3748 xdp_queries[i] = query_create_with_buffer( 3749 server_region, 3750 compressed_dname_offsets, 3751 compression_table_size, 3752 compressed_dnames, 3753 buffer); 3754 query_reset(xdp_queries[i], UDP_MAX_MESSAGE_LEN, 0); 3755 } 3756 } 3757 } 3758 #endif 3759 3760 /* The main loop... */ 3761 while ((mode = nsd->mode) != NSD_QUIT) { 3762 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 3763 3764 /* Do we need to do the statistics... */ 3765 if (mode == NSD_STATS) { 3766 #ifdef BIND8_STATS 3767 int p = nsd->st_period; 3768 nsd->st_period = 1; /* force stats printout */ 3769 /* Dump the statistics */ 3770 bind8_stats(nsd); 3771 nsd->st_period = p; 3772 #else /* !BIND8_STATS */ 3773 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3774 #endif /* BIND8_STATS */ 3775 3776 nsd->mode = NSD_RUN; 3777 } 3778 else if (mode == NSD_REAP_CHILDREN) { 3779 /* got signal, notify parent. parent reaps terminated children. */ 3780 if (nsd->this_child->parent_fd != -1) { 3781 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3782 if (write(nsd->this_child->parent_fd, 3783 &parent_notify, 3784 sizeof(parent_notify)) == -1) 3785 { 3786 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3787 (int) nsd->this_child->pid, strerror(errno)); 3788 } 3789 } else /* no parent, so reap 'em */ 3790 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3791 nsd->mode = NSD_RUN; 3792 } 3793 else if(mode == NSD_RUN) { 3794 /* Wait for a query... 
*/ 3795 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3796 if (errno != EINTR) { 3797 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3798 break; 3799 } 3800 } 3801 } else if(mode == NSD_QUIT) { 3802 /* ignore here, quit */ 3803 } else { 3804 log_msg(LOG_ERR, "mode bad value %d, back to service.", 3805 (int)mode); 3806 nsd->mode = NSD_RUN; 3807 } 3808 } 3809 3810 /* This part is seemingly never reached as the loop WOULD exit on NSD_QUIT, 3811 * but nsd->mode is only set to NSD_QUIT in ipc_child_quit. However, that 3812 * function also calls exit(). */ 3813 3814 service_remaining_tcp(nsd); 3815 #ifdef BIND8_STATS 3816 bind8_stats(nsd); 3817 #endif /* BIND8_STATS */ 3818 3819 #ifdef MEMCLEAN /* OS collects memory pages */ 3820 #ifdef RATELIMIT 3821 rrl_deinit(nsd->this_child->child_num); 3822 #endif 3823 event_base_free(event_base); 3824 region_destroy(server_region); 3825 #endif 3826 server_shutdown(nsd); 3827 } 3828 3829 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3830 { 3831 int* timed_out = (int*)arg; 3832 assert((event & EV_TIMEOUT)); (void)event; 3833 /* wake up the service tcp thread, note event is no longer 3834 * registered */ 3835 *timed_out = 1; 3836 } 3837 3838 void 3839 service_remaining_tcp(struct nsd* nsd) 3840 { 3841 struct tcp_handler_data* p; 3842 struct event_base* event_base; 3843 /* check if it is needed */ 3844 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3845 return; 3846 VERBOSITY(5, (LOG_INFO, "service remaining TCP connections")); 3847 #ifdef USE_DNSTAP 3848 /* remove dnstap collector, we cannot write there because the new 3849 * child process is using the file descriptor, or the child 3850 * process after that. 
*/ 3851 dt_collector_destroy(nsd->dt_collector, nsd); 3852 nsd->dt_collector = NULL; 3853 #endif 3854 /* setup event base */ 3855 event_base = nsd_child_event_base(); 3856 if(!event_base) { 3857 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3858 return; 3859 } 3860 /* register tcp connections */ 3861 for(p = tcp_active_list; p != NULL; p = p->next) { 3862 struct timeval timeout; 3863 int fd = p->event.ev_fd; 3864 #ifdef USE_MINI_EVENT 3865 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3866 #else 3867 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3868 #endif 3869 void (*fn)(int, short, void*); 3870 #ifdef HAVE_SSL 3871 if(p->tls) { 3872 if((event&EV_READ)) 3873 fn = handle_tls_reading; 3874 else fn = handle_tls_writing; 3875 } else if(p->tls_auth) { 3876 if((event&EV_READ)) 3877 fn = handle_tls_reading; 3878 else fn = handle_tls_writing; 3879 } else { 3880 #endif 3881 if((event&EV_READ)) 3882 fn = handle_tcp_reading; 3883 else fn = handle_tcp_writing; 3884 #ifdef HAVE_SSL 3885 } 3886 #endif 3887 3888 p->tcp_no_more_queries = 1; 3889 /* set timeout to 3 seconds (previously 1/10 second) */ 3890 if(p->tcp_timeout > 3000) 3891 p->tcp_timeout = 3000; 3892 timeout.tv_sec = p->tcp_timeout / 1000; 3893 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3894 event_del(&p->event); 3895 memset(&p->event, 0, sizeof(p->event)); 3896 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3897 fn, p); 3898 if(event_base_set(event_base, &p->event) != 0) 3899 log_msg(LOG_ERR, "event base set failed"); 3900 if(event_add(&p->event, &timeout) != 0) 3901 log_msg(LOG_ERR, "event add failed"); 3902 } 3903 3904 /* handle it */ 3905 while(nsd->current_tcp_count > 0) { 3906 mode_t m = server_signal_mode(nsd); 3907 struct event timeout; 3908 struct timeval tv; 3909 int timed_out = 0; 3910 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3911 m == NSD_REAP_CHILDREN) { 3912 /* quit */ 3913 break; 3914 } 3915 /* timer */ 3916 /* have to do something every 3 seconds */ 
3917 tv.tv_sec = 3; 3918 tv.tv_usec = 0; 3919 memset(&timeout, 0, sizeof(timeout)); 3920 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3921 &timed_out); 3922 if(event_base_set(event_base, &timeout) != 0) 3923 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3924 if(event_add(&timeout, &tv) != 0) 3925 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3926 3927 /* service loop */ 3928 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3929 if (errno != EINTR) { 3930 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3931 break; 3932 } 3933 } 3934 if(!timed_out) { 3935 event_del(&timeout); 3936 } else { 3937 /* timed out, quit */ 3938 VERBOSITY(5, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3939 break; 3940 } 3941 } 3942 #ifdef MEMCLEAN 3943 event_base_free(event_base); 3944 #endif 3945 /* continue to quit after return */ 3946 } 3947 3948 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3949 * are always used, even if nonblocking operations are broken, in which case 3950 * NUM_RECV_PER_SELECT is defined to 1 (one). 
 */
#if defined(HAVE_RECVMMSG)
#define nsd_recvmmsg recvmmsg
#else /* !HAVE_RECVMMSG */

/*
 * Fallback recvmmsg(2) replacement: receive up to vlen datagrams one at
 * a time with recvfrom(2), filling in msg_len for each message received.
 *
 * sockfd: socket to read from.
 * msgvec: array of vlen mmsghdr entries; each entry's msg_hdr must carry
 *	exactly one iovec (callers in this file set msg_iovlen = 1).
 * vlen:   maximum number of datagrams to receive.
 * flags:  passed through to recvfrom(2).
 * timeout: must be NULL; the timeout feature of recvmmsg(2) is not
 *	emulated (asserted below).
 *
 * Returns the number of messages received (> 0), 0 when nothing was read
 * and there is no pending error (or the error was EAGAIN), or -1 with
 * errno set.
 */
static int
nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
	int flags, struct timespec *timeout)
{
	unsigned int vpos = 0;
	ssize_t rcvd;

	/* timeout is ignored, ensure caller does not expect it to work */
	assert(timeout == NULL); (void)timeout;

	while(vpos < vlen) {
		rcvd = recvfrom(sockfd,
			msgvec[vpos].msg_hdr.msg_iov->iov_base,
			msgvec[vpos].msg_hdr.msg_iov->iov_len,
			flags,
			msgvec[vpos].msg_hdr.msg_name,
			&msgvec[vpos].msg_hdr.msg_namelen);
		if(rcvd < 0) {
			/* stop on first failure; partial progress is
			 * still reported to the caller below */
			break;
		} else {
			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
			msgvec[vpos].msg_len = (unsigned int)rcvd;
			vpos++;
		}
	}

	if(vpos) {
		/* error will be picked up next time */
		return (int)vpos;
	} else if(errno == 0) {
		/* NOTE(review): errno is not cleared on entry, so this
		 * may observe a stale zero/non-zero value when vlen is 0
		 * or recvfrom failed without touching errno — presumably
		 * harmless in practice; confirm against callers. */
		return 0;
	} else if(errno == EAGAIN) {
		/* nonblocking socket with no data: report "nothing read"
		 * rather than an error, matching recvmmsg(2) usage here */
		return 0;
	}

	return -1;
}
#endif /* HAVE_RECVMMSG */

#ifdef HAVE_SENDMMSG
#define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
#else /* !HAVE_SENDMMSG */

/*
 * Fallback sendmmsg(2) replacement: send up to vlen datagrams one at a
 * time with sendto(2), filling in msg_len for each message sent.
 *
 * sockfd: socket to send on.
 * msgvec: array of vlen mmsghdr entries; each msg_hdr must carry exactly
 *	one iovec (asserted below).
 * vlen:   number of datagrams to send.
 * flags:  passed through to sendto(2).
 *
 * Returns the number of messages sent (> 0), 0 when nothing was sent and
 * errno is clear, or -1 with errno set from the failing sendto(2); a
 * failure after partial progress is reported as success for the sent
 * portion and the error is picked up on the caller's next attempt.
 */
static int
nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
{
	unsigned int vpos = 0;
	ssize_t snd;

	while(vpos < vlen) {
		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
		snd = sendto(sockfd,
			msgvec[vpos].msg_hdr.msg_iov->iov_base,
			msgvec[vpos].msg_hdr.msg_iov->iov_len,
			flags,
			msgvec[vpos].msg_hdr.msg_name,
			msgvec[vpos].msg_hdr.msg_namelen);
		if(snd < 0) {
			break;
		} else {
			msgvec[vpos].msg_len = (unsigned int)snd;
			vpos++;
		}
	}

	if(vpos) {
		return (int)vpos;
	} else if(errno == 0) {
		/* NOTE(review): as in nsd_recvmmsg above, errno is read
		 * without being cleared first, so this branch may act on
		 * a stale value — confirm this matches caller intent. */
		return 0;
	}

	return -1;
}
#endif /* HAVE_SENDMMSG */

/*
 * Check whether the port number in a socket address is zero.
 * addr: sockaddr_storage (sockaddr_in when compiled without INET6)
 *	holding an AF_INET or AF_INET6 address.
 * Returns nonzero if the (network-order) port is 0; returns 0 for a
 * zero port check failure or an unrecognized address family.
 * Used by the UDP send path to suppress log noise for replies to
 * port-zero sources at low verbosity.
 */
static int
port_is_zero(
#ifdef INET6
	struct sockaddr_storage *addr
#else
	struct sockaddr_in *addr
#endif
	)
{
#ifdef INET6
	if(addr->ss_family == AF_INET6) {
		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
	} else if(addr->ss_family == AF_INET) {
		return (((struct sockaddr_in *)addr)->sin_port) == 0;
	}
	return 0;
#else
	if(addr->sin_family == AF_INET) {
		return addr->sin_port == 0;
	}
	return 0;
#endif
}

/* Parses the PROXYv2 header from buf and updates the struct.
 * Returns 1 on success, 0 on failure.
*/ 4057 static int 4058 consume_pp2_header(struct buffer* buf, struct query* q, int stream) 4059 { 4060 size_t size; 4061 struct pp2_header* header; 4062 int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf)); 4063 if(err) { 4064 VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse " 4065 "PROXYv2 header: %s", pp_lookup_error(err))); 4066 return 0; 4067 } 4068 header = (struct pp2_header*)buffer_begin(buf); 4069 size = PP2_HEADER_SIZE + read_uint16(&header->len); 4070 if(size > buffer_limit(buf)) { 4071 VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer " 4072 "size to read PROXYv2 header")); 4073 return 0; 4074 } 4075 if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) { 4076 /* A connection from the proxy itself. 4077 * No need to do anything with addresses. */ 4078 goto done; 4079 } 4080 if(header->fam_prot == PP2_UNSPEC_UNSPEC) { 4081 /* Unspecified family and protocol. This could be used for 4082 * health checks by proxies. 4083 * No need to do anything with addresses. */ 4084 goto done; 4085 } 4086 /* Read the proxied address */ 4087 switch(header->fam_prot) { 4088 case PP2_INET_STREAM: 4089 case PP2_INET_DGRAM: 4090 { 4091 struct sockaddr_in* addr = 4092 (struct sockaddr_in*)&q->client_addr; 4093 addr->sin_family = AF_INET; 4094 memmove(&addr->sin_addr.s_addr, 4095 &header->addr.addr4.src_addr, 4); 4096 memmove(&addr->sin_port, &header->addr.addr4.src_port, 4097 2); 4098 q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in); 4099 } 4100 /* Ignore the destination address; it should be us. 
 */
		break;
#ifdef INET6
	case PP2_INET6_STREAM:
	case PP2_INET6_DGRAM:
		{
		struct sockaddr_in6* addr =
			(struct sockaddr_in6*)&q->client_addr;
		memset(addr, 0, sizeof(*addr));
		addr->sin6_family = AF_INET6;
		/* copy the 16-byte IPv6 source address and 2-byte port
		 * straight from the PROXYv2 header (network byte order) */
		memmove(&addr->sin6_addr,
			header->addr.addr6.src_addr, 16);
		memmove(&addr->sin6_port, &header->addr.addr6.src_port,
			2);
		q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
		}
		/* Ignore the destination address; it should be us. */
		break;
#endif /* INET6 */
	default:
		VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
			"family and protocol 0x%x",
			(int)header->fam_prot));
		return 0;
	}
	q->is_proxied = 1;
done:
	if(!stream) {
		/* We are reading a whole packet;
		 * Move the rest of the data to overwrite the PROXYv2 header */
		/* XXX can we do better to avoid memmove? */
		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
		buffer_set_limit(buf, buffer_limit(buf)-size);
	}
	return 1;
}

/*
 * Event callback for a UDP socket: receive a batch of datagrams with
 * nsd_recvmmsg, process each as a DNS query (optionally stripping a
 * PROXYv2 header first), then send the answers back with nsd_sendmmsg.
 * Dropped/discarded queries are swapped to the tail of the batch so the
 * send loop only transmits the first 'recvcount' answers.
 */
static void
handle_udp(int fd, short event, void* arg)
{
	struct udp_handler_data *data = (struct udp_handler_data *) arg;
	int received, sent, recvcount, i;
	struct query *q;
	uint32_t now = 0;

	if (!(event & EV_READ)) {
		return;
	}
	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	/* this printf strangely gave a performance increase on Linux */
	/* printf("recvcount %d \n", recvcount); */
	if (recvcount == -1) {
		if (errno != EAGAIN && errno != EINTR) {
			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
		}
		/* Simply no data available */
		return;
	}
	for (i = 0; i < recvcount; i++) {
	loopstart:
		received = msgs[i].msg_len;
		queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen;
		queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr);
		queries[i]->is_proxied = 0;
		q = queries[i];
		if (received == -1) {
			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
#if defined(HAVE_RECVMMSG)
				msgs[i].msg_hdr.msg_flags
#else
				errno
#endif
				));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
			goto swap_drop;
		}

		/* Account... */
#ifdef BIND8_STATS
		if (data->socket->addr.ai_family == AF_INET) {
			STATUP(data->nsd, qudp);
		} else if (data->socket->addr.ai_family == AF_INET6) {
			STATUP(data->nsd, qudp6);
		}
#endif

		buffer_skip(q->packet, received);
		buffer_flip(q->packet);
		if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) {
			VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not "
				"consume PROXYv2 header"));
			goto swap_drop;
		}
		if(!q->is_proxied) {
			/* no PROXYv2 header: the peer address is the client */
			q->client_addrlen = q->remote_addrlen;
			memmove(&q->client_addr, &q->remote_addr,
				q->remote_addrlen);
		}
#ifdef USE_DNSTAP
		/*
		 * sending UDP-query with server address (local) and client address to dnstap process
		 */
		log_addr("query from client", &q->client_addr);
		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
		if(verbosity >= 6 && q->is_proxied)
			log_addr("query via proxy", &q->remote_addr);
		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen,
			q->tcp, q->packet);
#endif /* USE_DNSTAP */

		/* Process and answer the query... */
		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
				STATUP(data->nsd, nona);
				ZTATUP(data->nsd, q->zone, nona);
			}

#ifdef USE_ZONE_STATS
			if (data->socket->addr.ai_family == AF_INET) {
				ZTATUP(data->nsd, q->zone, qudp);
			} else if (data->socket->addr.ai_family == AF_INET6) {
				ZTATUP(data->nsd, q->zone, qudp6);
			}
#endif

			/* Add EDNS0 and TSIG info if necessary. */
			query_add_optional(q, data->nsd, &now);

			buffer_flip(q->packet);
			iovecs[i].iov_len = buffer_remaining(q->packet);
#ifdef BIND8_STATS
			/* Account the rcode & TC... */
			STATUP2(data->nsd, rcode, RCODE(q->packet));
			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
			if (TC(q->packet)) {
				STATUP(data->nsd, truncated);
				ZTATUP(data->nsd, q->zone, truncated);
			}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
			/*
			 * sending UDP-response with server address (local) and client address to dnstap process
			 */
			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
			log_addr("response to client", &q->client_addr);
			if(verbosity >= 6 && q->is_proxied)
				log_addr("response via proxy", &q->remote_addr);
			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
				&q->client_addr, q->client_addrlen, q->tcp, q->packet,
				q->zone);
#endif /* USE_DNSTAP */
		} else {
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
		swap_drop:
			STATUP(data->nsd, dropped);
			ZTATUP(data->nsd, q->zone, dropped);
			if(i != recvcount-1) {
				/* swap with last and decrease recvcount;
				 * re-process the entry swapped into slot i */
				struct mmsghdr mtmp = msgs[i];
				struct iovec iotmp = iovecs[i];
				recvcount--;
				msgs[i] = msgs[recvcount];
				iovecs[i] = iovecs[recvcount];
				queries[i] = queries[recvcount];
				msgs[recvcount] = mtmp;
				iovecs[recvcount] = iotmp;
				queries[recvcount] = q;
				/* iov pointers must follow the swapped iovec slots */
				msgs[i].msg_hdr.msg_iov = &iovecs[i];
				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
				goto loopstart;
			} else { recvcount --; }
		}
	}

	/* send until all are sent */
	i = 0;
	while(i<recvcount) {
		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
		if(sent == -1) {
			if(errno == ENOBUFS ||
#ifdef EWOULDBLOCK
				errno == EWOULDBLOCK ||
#endif
				errno == EAGAIN) {
				/* block to wait until send buffer avail:
				 * temporarily clear O_NONBLOCK, retry the
				 * send, then restore nonblocking mode */
				int flag, errstore;
				if((flag = fcntl(fd, F_GETFL)) == -1) {
					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
					flag = 0;
				}
				flag &= ~O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
				errstore = errno;
				flag |= O_NONBLOCK;
				if(fcntl(fd, F_SETFL, flag) == -1)
					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
				if(sent != -1) {
					i += sent;
					continue;
				}
				/* restore errno from the failed blocking send,
				 * the fcntl calls may have overwritten it */
				errno = errstore;
			}
			if(errno == EINVAL) {
				/* skip the invalid argument entry,
				 * send the remaining packets in the list */
				if(!(port_is_zero((void*)&queries[i]->remote_addr) &&
					verbosity < 3)) {
					const char* es = strerror(errno);
					char a[64];
					addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
				}
				i += 1;
				continue;
			}
			/* don't log transient network full errors, unless
			 * on higher verbosity */
			if(!(errno == ENOBUFS && verbosity < 1) &&
#ifdef EWOULDBLOCK
				errno != EWOULDBLOCK &&
#endif
				errno != EAGAIN) {
				const char* es = strerror(errno);
				char a[64];
				addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
			}
#ifdef BIND8_STATS
			data->nsd->st->txerr += recvcount-i;
#endif /* BIND8_STATS */
			break;
		}
		i += sent;
	}
	/* reset all query/iovec slots for the next receive batch */
	for(i=0; i<recvcount; i++) {
		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
	}
}

#ifdef HAVE_SSL
/*
 * Setup an event for the tcp handler.
 * Replaces data->event with a fresh event on fd for the given event mask
 * and callback fn, using the server's configured tcp_timeout.
 */
static void
tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
	int fd, short event)
{
	struct timeval timeout;
	struct event_base* ev_base;

	timeout.tv_sec = data->nsd->tcp_timeout;
	timeout.tv_usec = 0L;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, event, fn, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add failed");
}
#endif /* HAVE_SSL */

/*
 * Tear down one TCP connection handler: free TLS state, close the fd,
 * unlink from the active list and release the handler's region.
 */
static void
cleanup_tcp_handler(struct tcp_handler_data* data)
{
	event_del(&data->event);
#ifdef HAVE_SSL
	if(data->tls) {
		SSL_shutdown(data->tls);
		SSL_free(data->tls);
		data->tls = NULL;
	}
	if(data->tls_auth) {
		SSL_shutdown(data->tls_auth);
		SSL_free(data->tls_auth);
		data->tls_auth = NULL;
	}
#endif
	data->pp2_header_state = pp2_header_none;
	close(data->event.ev_fd);
	/* unlink from the doubly linked list of active tcp handlers */
	if(data->prev)
		data->prev->next = data->next;
	else tcp_active_list = data->next;
	if(data->next)
		data->next->prev = data->prev;

	/*
	 * Enable the TCP accept handlers when the current number of
	 * TCP 
connections is about to drop below the maximum number
	 * of TCP connections.
	 */
	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
		configure_handler_event_types(EV_READ|EV_PERSIST);
		if(slowaccept) {
			event_del(&slowaccept_event);
			slowaccept = 0;
		}
	}
	--data->nsd->current_tcp_count;
	assert(data->nsd->current_tcp_count >= 0);

	region_destroy(data->region);
}

/* Read more data into the buffer for tcp read. Pass the amount of additional
 * data required. Returns false if nothing needs to be done this event, or
 * true if the additional data is in the buffer.
 * On fatal read errors or EOF the connection is cleaned up (and false is
 * returned); on EAGAIN/EINTR it simply waits for the next event. */
static int
more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos,
	size_t add_amount, ssize_t* received)
{
	*received = read(fd, bufpos, add_amount);
	if (*received == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Read would block, wait until more
			 * data is available.
			 */
			return 0;
		} else {
			char buf[48];
			if(data->query) {
				addr2str(&data->query->remote_addr, buf, sizeof(buf));
			} else {
				snprintf(buf, sizeof(buf), "unknown");
			}
#ifdef ECONNRESET
			if (verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
			cleanup_tcp_handler(data);
			return 0;
		}
	} else if (*received == 0) {
		/* EOF */
		cleanup_tcp_handler(data);
		return 0;
	}
	return 1;
}

/*
 * Event callback for reading a DNS query from a TCP connection.
 * Incrementally consumes (in order): an optional PROXYv2 header, the
 * two-byte length prefix, then the query itself; data->bytes_transmitted
 * tracks progress across events. When a full query is read, it is
 * processed and the handler switches to handle_tcp_writing.
 */
static void
handle_tcp_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	struct event_base* ev_base;
	struct timeval timeout;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		(data->query_count > 0 && data->tcp_no_more_queries))
	{
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
		data->query_needs_reset = 0;
	}

	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
		struct pp2_header* header = NULL;
		size_t want_read_size = 0;
		size_t current_read_size = 0;
		if(data->pp2_header_state == pp2_header_none) {
			/* first read the fixed-size part of the header */
			want_read_size = PP2_HEADER_SIZE;
			if(buffer_remaining(data->query->packet) <
				want_read_size) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
				cleanup_tcp_handler(data);
				return;
			}
			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
			current_read_size = want_read_size;
			if(data->bytes_transmitted < current_read_size) {
				if(!more_read_buf_tcp(fd, data,
					(void*)buffer_at(data->query->packet,
						data->bytes_transmitted),
					current_read_size - data->bytes_transmitted,
					&received))
					return;
				data->bytes_transmitted += received;
				buffer_skip(data->query->packet, received);
				if(data->bytes_transmitted != current_read_size)
					return;
				data->pp2_header_state = pp2_header_init;
			}
		}
		if(data->pp2_header_state == pp2_header_init) {
			/* fixed part is in; validate it and read the
			 * variable-length tail announced in header->len */
			int err;
			err = pp2_read_header(buffer_begin(data->query->packet),
				buffer_limit(data->query->packet));
			if(err) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
				cleanup_tcp_handler(data);
				return;
			}
			header = (struct pp2_header*)buffer_begin(data->query->packet);
			want_read_size = ntohs(header->len);
			if(buffer_limit(data->query->packet) <
				PP2_HEADER_SIZE + want_read_size) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
				cleanup_tcp_handler(data);
				return;
			}
			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
			current_read_size = PP2_HEADER_SIZE + want_read_size;
			if(want_read_size == 0) {
				/* nothing more to read; header is complete */
				data->pp2_header_state = pp2_header_done;
			} else if(data->bytes_transmitted < current_read_size) {
				if(!more_read_buf_tcp(fd, data,
					(void*)buffer_at(data->query->packet,
						data->bytes_transmitted),
					current_read_size - data->bytes_transmitted,
					&received))
					return;
				data->bytes_transmitted += received;
				buffer_skip(data->query->packet, received);
				if(data->bytes_transmitted != current_read_size)
					return;
				data->pp2_header_state = pp2_header_done;
			}
		}
		if(data->pp2_header_state != pp2_header_done || !header) {
			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));

			cleanup_tcp_handler(data);
			return;
		}
		buffer_flip(data->query->packet);
		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));

			cleanup_tcp_handler(data);
			return;
		}
		/* Clear and reset the buffer to read the following
		 * DNS packet(s). */
		buffer_clear(data->query->packet);
		data->bytes_transmitted = 0;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if (data->bytes_transmitted < sizeof(uint16_t)) {
		if(!more_read_buf_tcp(fd, data,
			(char*) &data->query->tcplen + data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted, &received))
			return;
		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}
		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name (1)
		 *   + Query class (2)
		 *   + Query type (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet),
		buffer_remaining(data->query->packet), &received))
		return;
	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifdef BIND8_STATS
#ifndef INET6
	STATUP(data->nsd, ctcp);
#else
	if (data->query->remote_addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctcp);
	} else if (data->query->remote_addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctcp6);
	}
#endif
#endif /* BIND8_STATS */

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * and send TCP-query with found address (local) and client address to dnstap process
	 */
	log_addr("query from client", &data->query->client_addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	if(verbosity >= 6 && data->query->is_proxied)
		log_addr("query via proxy", &data->query->remote_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
		data->query->client_addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctcp);
#else
	if (data->query->remote_addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctcp);
	} else if (data->query->remote_addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctcp6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC
 */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * sending TCP-response with found (earlier) address (local) and client address to dnstap process
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->client_addr);
	if(verbosity >= 6 && data->query->is_proxied)
		log_addr("response via proxy", &data->query->remote_addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
		data->query->client_addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
		handle_tcp_writing, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpr failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
	handle_tcp_writing(fd, EV_WRITE, data);
}

/*
 * Event callback for writing a DNS response to a TCP connection.
 * Writes the two-byte length prefix and then the packet (with writev
 * both can go out in one call); data->bytes_transmitted tracks progress
 * across events. Continues AXFR/IXFR transfers, then reinstalls the
 * read handler for the next query on this connection.
 */
static void
handle_tcp_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	struct timeval timeout;
	struct event_base* ev_base;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT) || !q) {
		/* Connection timed out. */
		/* Or data->query is NULL, in which case nothing to do. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if (data->bytes_transmitted < sizeof(q->tcplen)) {
		/* Writing the response packet length. */
		uint16_t n_tcplen = htons(q->tcplen);
#ifdef HAVE_WRITEV
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
		iov[1].iov_base = buffer_begin(q->packet);
		iov[1].iov_len = buffer_limit(q->packet);
		sent = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		sent = write(fd,
			(const char *) &n_tcplen + data->bytes_transmitted,
			sizeof(n_tcplen) - data->bytes_transmitted);
#endif /* HAVE_WRITEV */
		if (sent == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Write would block, wait until
				 * socket becomes writable again.
				 */
				return;
			} else {
				/* note: the two ifdef'd ifs nest, so the log
				 * is skipped when either errno matches at
				 * low verbosity */
#ifdef ECONNRESET
				if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
				{
					char client_ip[128];
					if(data->query) {
						addr2str(&data->query->client_addr, client_ip, sizeof(client_ip));
					} else {
						snprintf(client_ip, sizeof(client_ip), "unknown");
					}
					log_msg(LOG_ERR, "failed writing to tcp from %s: %s", client_ip, strerror(errno));
				}
				cleanup_tcp_handler(data);
				return;
			}
		}

		data->bytes_transmitted += sent;
		if (data->bytes_transmitted < sizeof(q->tcplen)) {
			/*
			 * Writing not complete, wait until socket
			 * becomes writable again.
			 */
			return;
		}

#ifdef HAVE_WRITEV
		/* writev sent the length prefix and (part of) the packet;
		 * keep only the packet byte count for buffer_skip below */
		sent -= sizeof(n_tcplen);
		/* handle potential 'packet done' code */
		goto packet_could_be_done;
#endif
	}

	sent = write(fd,
		buffer_current(q->packet),
		buffer_remaining(q->packet));
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Write would block, wait until
			 * socket becomes writable again.
			 */
			return;
		} else {
#ifdef ECONNRESET
			if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
			if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
			{
				char client_ip[128];
				if(data->query) {
					addr2str(&data->query->client_addr, client_ip, sizeof(client_ip));
				} else {
					snprintf(client_ip, sizeof(client_ip), "unknown");
				}
				log_msg(LOG_ERR, "failed writing to tcp from %s: %s", client_ip, strerror(errno));
			}
			cleanup_tcp_handler(data);
			return;
		}
	}

	data->bytes_transmitted += sent;
#ifdef HAVE_WRITEV
packet_could_be_done:
#endif
	buffer_skip(q->packet, sent);
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR ||
		data->query_state == QUERY_IN_IXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		if(data->query_state == QUERY_IN_AXFR)
			data->query_state = query_axfr(data->nsd, q, 0);
		else data->query_state = query_ixfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset timeout. */
			timeout.tv_sec = data->tcp_timeout / 1000;
			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
			ev_base = data->event.ev_base;
			event_del(&data->event);
			memset(&data->event, 0, sizeof(data->event));
			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
				handle_tcp_writing, data);
			if(event_base_set(ev_base, &data->event) != 0)
				log_msg(LOG_ERR, "event base set tcpw failed");
			if(event_add(&data->event, &timeout) != 0)
				log_msg(LOG_ERR, "event add tcpw failed");

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;
	data->query_needs_reset = 1;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}

#ifdef HAVE_SSL
/** create SSL object and associate fd */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new((SSL_CTX*)ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	/* server side: accept the incoming handshake */
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}

/** TLS handshake to upgrade TCP connection.
 * Drives SSL_do_handshake on data->tls_auth (if set) or data->tls,
 * switching the event between read and write mode as the handshake
 * demands (SSL_ERROR_WANT_READ/WANT_WRITE). 'writing' tells which mode
 * the caller wants once the handshake completes.
 * Returns 0 if the connection was cleaned up, 1 otherwise. */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) setup the TLS connection */
	ERR_clear_error();
	if(data->tls_auth)
		r = SSL_do_handshake(data->tls_auth);
	else
		r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want;
		if(data->tls_auth)
			want = SSL_get_error(data->tls_auth, r);
		else
			want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			/* handshake failed for real; log (unless squelched)
			 * and drop the connection */
			if(r == 0)
				VERBOSITY(5, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					if(data->query) {
						addr2str(&data->query->remote_addr, a, sizeof(a));
					} else {
						snprintf(a, sizeof(a), "unknown");
					}
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(LOG_ERR, s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Use to log successful upgrade for testing - could be removed*/
	if(data->tls_auth)
		VERBOSITY(5, (LOG_INFO, "TLS-AUTH handshake succeeded."));
	else
		VERBOSITY(5, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}

/* Read more data into the buffer for tls read. Pass the amount of additional
 * data required. Returns false if nothing needs to be done this event, or
 * true if the additional data is in the buffer.
 */
static int
more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos,
	size_t add_amount, ssize_t* received)
{
	int r;
	ERR_clear_error();
	if(data->tls_auth)
		r = (*received=SSL_read(data->tls_auth, bufpos, add_amount));
	else
		r = (*received=SSL_read(data->tls, bufpos, add_amount));
	if(r <= 0) {
		int want;
		if(data->tls_auth)
			want = SSL_get_error(data->tls_auth, *received);
		else
			want = SSL_get_error(data->tls, *received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return 0; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return 0;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch to writing (TLS renegotiation may need
			 * a write before the read can proceed) */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return 0;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return 0;
	}
	return 1;
}

/** handle TLS reading of incoming query.
 * TLS counterpart of handle_tcp_reading: completes any pending handshake,
 * then incrementally reads the optional PROXYv2 header, the two-byte
 * length prefix and the query via more_read_buf_tls. */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		(data->query_count > 0 && data->tcp_no_more_queries))
	{
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
		data->query_needs_reset = 0;
	}

	if(data->shake_state != tls_hs_none) {
		/* handshake not finished; continue it and bail out until
		 * it completes */
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
		struct pp2_header* header = NULL;
		size_t want_read_size = 0;
		size_t current_read_size = 0;
		if(data->pp2_header_state == pp2_header_none) {
			/* first read the fixed-size part of the header */
			want_read_size = PP2_HEADER_SIZE;
			if(buffer_remaining(data->query->packet) <
				want_read_size) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
				cleanup_tcp_handler(data);
				return;
			}
			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
			current_read_size = want_read_size;
			if(data->bytes_transmitted < current_read_size) {
				if(!more_read_buf_tls(fd, data,
					buffer_at(data->query->packet,
						data->bytes_transmitted),
					current_read_size - data->bytes_transmitted,
					&received))
					return;
				data->bytes_transmitted += received;
				buffer_skip(data->query->packet, received);
				if(data->bytes_transmitted != current_read_size)
					return;
				data->pp2_header_state = pp2_header_init;
			}
		}
		if(data->pp2_header_state == pp2_header_init) {
			/* fixed part is in; validate it and read the
			 * variable-length tail announced in header->len */
			int err;
			err = pp2_read_header(buffer_begin(data->query->packet),
				buffer_limit(data->query->packet));
			if(err) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
				cleanup_tcp_handler(data);
				return;
			}
			header = (struct pp2_header*)buffer_begin(data->query->packet);
			want_read_size = ntohs(header->len);
			if(buffer_limit(data->query->packet) <
				PP2_HEADER_SIZE + want_read_size) {
				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
				cleanup_tcp_handler(data);
				return;
			}
			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
			current_read_size = PP2_HEADER_SIZE + want_read_size;
			if(want_read_size == 0) {
				/* nothing more to read; header is complete */
				data->pp2_header_state = pp2_header_done;
			} else if(data->bytes_transmitted < current_read_size) {
				if(!more_read_buf_tls(fd, data,
					buffer_at(data->query->packet,
						data->bytes_transmitted),
					current_read_size - data->bytes_transmitted,
					&received))
					return;
				data->bytes_transmitted += received;
				buffer_skip(data->query->packet, received);
				if(data->bytes_transmitted != current_read_size)
					return;
				data->pp2_header_state = pp2_header_done;
			}
		}
		if(data->pp2_header_state != pp2_header_done || !header) {
			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
			cleanup_tcp_handler(data);
			return;
		}
		buffer_flip(data->query->packet);
		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
			cleanup_tcp_handler(data);
			return;
		}
		/* Clear and reset the buffer to read the following
		 * DNS packet(s). */
		buffer_clear(data->query->packet);
		data->bytes_transmitted = 0;
	}
	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		if(!more_read_buf_tls(fd, data,
			(char *) &data->query->tcplen + data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted, &received))
			return;
		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 * Size of the header (12)
		 * + Root domain name (1)
		 * + Query class (2)
		 * + Query type (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet),
		buffer_remaining(data->query->packet), &received))
		return;
	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->remote_addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->remote_addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * and send TCP-query with found address (local) and client address to dnstap process
	 */
	log_addr("query from client", &data->query->client_addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	if(verbosity >= 6 && data->query->is_proxied)
		log_addr("query via proxy", &data->query->remote_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
		data->query->client_addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection...
*/ 5274 STATUP(data->nsd, dropped); 5275 ZTATUP(data->nsd, data->query->zone, dropped); 5276 cleanup_tcp_handler(data); 5277 return; 5278 } 5279 5280 #ifdef BIND8_STATS 5281 if (RCODE(data->query->packet) == RCODE_OK 5282 && !AA(data->query->packet)) 5283 { 5284 STATUP(data->nsd, nona); 5285 ZTATUP(data->nsd, data->query->zone, nona); 5286 } 5287 #endif /* BIND8_STATS */ 5288 5289 #ifdef USE_ZONE_STATS 5290 #ifndef INET6 5291 ZTATUP(data->nsd, data->query->zone, ctls); 5292 #else 5293 if (data->query->remote_addr.ss_family == AF_INET) { 5294 ZTATUP(data->nsd, data->query->zone, ctls); 5295 } else if (data->query->remote_addr.ss_family == AF_INET6) { 5296 ZTATUP(data->nsd, data->query->zone, ctls6); 5297 } 5298 #endif 5299 #endif /* USE_ZONE_STATS */ 5300 5301 query_add_optional(data->query, data->nsd, &now); 5302 5303 /* Switch to the tcp write handler. */ 5304 buffer_flip(data->query->packet); 5305 data->query->tcplen = buffer_remaining(data->query->packet); 5306 #ifdef BIND8_STATS 5307 /* Account the rcode & TC... 
*/ 5308 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 5309 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 5310 if (TC(data->query->packet)) { 5311 STATUP(data->nsd, truncated); 5312 ZTATUP(data->nsd, data->query->zone, truncated); 5313 } 5314 #endif /* BIND8_STATS */ 5315 #ifdef USE_DNSTAP 5316 /* 5317 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 5318 */ 5319 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 5320 log_addr("response to client", &data->query->client_addr); 5321 if(verbosity >= 6 && data->query->is_proxied) 5322 log_addr("response via proxy", &data->query->remote_addr); 5323 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr, 5324 data->query->client_addrlen, data->query->tcp, data->query->packet, 5325 data->query->zone); 5326 #endif /* USE_DNSTAP */ 5327 data->bytes_transmitted = 0; 5328 5329 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 5330 5331 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 5332 handle_tls_writing(fd, EV_WRITE, data); 5333 } 5334 5335 /** handle TLS writing of outgoing response */ 5336 static void 5337 handle_tls_writing(int fd, short event, void* arg) 5338 { 5339 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 5340 ssize_t sent; 5341 struct query *q = data->query; 5342 /* static variable that holds reassembly buffer used to put the 5343 * TCP length in front of the packet, like writev. */ 5344 static buffer_type* global_tls_temp_buffer = NULL; 5345 buffer_type* write_buffer; 5346 uint32_t now = 0; 5347 5348 if ((event & EV_TIMEOUT) || !q) { 5349 /* Connection timed out. */ 5350 /* Or data->query is NULL, in which case nothing to do. 
*/ 5351 cleanup_tcp_handler(data); 5352 return; 5353 } 5354 5355 assert((event & EV_WRITE)); 5356 5357 if(data->shake_state != tls_hs_none) { 5358 if(!tls_handshake(data, fd, 1)) 5359 return; 5360 if(data->shake_state != tls_hs_none) 5361 return; 5362 } 5363 5364 if(data->tls_auth) 5365 (void)SSL_set_mode(data->tls_auth, SSL_MODE_ENABLE_PARTIAL_WRITE); 5366 else 5367 (void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE); 5368 5369 /* If we are writing the start of a message, we must include the length 5370 * this is done with a copy into write_buffer. */ 5371 write_buffer = NULL; 5372 if (data->bytes_transmitted == 0) { 5373 if(!global_tls_temp_buffer) { 5374 /* gets deallocated when nsd shuts down from 5375 * nsd.region */ 5376 global_tls_temp_buffer = buffer_create(nsd.region, 5377 QIOBUFSZ + sizeof(q->tcplen)); 5378 if (!global_tls_temp_buffer) { 5379 return; 5380 } 5381 } 5382 write_buffer = global_tls_temp_buffer; 5383 buffer_clear(write_buffer); 5384 buffer_write_u16(write_buffer, q->tcplen); 5385 buffer_write(write_buffer, buffer_current(q->packet), 5386 (int)buffer_remaining(q->packet)); 5387 buffer_flip(write_buffer); 5388 } else { 5389 write_buffer = q->packet; 5390 } 5391 5392 /* Write the response */ 5393 ERR_clear_error(); 5394 if(data->tls_auth) 5395 sent = SSL_write(data->tls_auth, buffer_current(write_buffer), buffer_remaining(write_buffer)); 5396 else 5397 sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer)); 5398 if(sent <= 0) { 5399 int want; 5400 if(data->tls_auth) 5401 want = SSL_get_error(data->tls_auth, sent); 5402 else 5403 want = SSL_get_error(data->tls, sent); 5404 if(want == SSL_ERROR_ZERO_RETURN) { 5405 cleanup_tcp_handler(data); 5406 /* closed */ 5407 } else if(want == SSL_ERROR_WANT_READ) { 5408 /* switch back to reading */ 5409 data->shake_state = tls_hs_read_event; 5410 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 5411 } else if(want != 
SSL_ERROR_WANT_WRITE) { 5412 cleanup_tcp_handler(data); 5413 { 5414 char client_ip[128], e[188]; 5415 if(data->query) { 5416 addr2str(&data->query->client_addr, client_ip, sizeof(client_ip)); 5417 } else { 5418 snprintf(client_ip, sizeof(client_ip), "unknown"); 5419 } 5420 snprintf(e, sizeof(e), "failed writing to tls from %s: %s", 5421 client_ip, "SSL_write error"); 5422 log_crypto_err(e); 5423 } 5424 } 5425 return; 5426 } 5427 5428 buffer_skip(write_buffer, sent); 5429 if(buffer_remaining(write_buffer) != 0) { 5430 /* If not all sent, sync up the real buffer if it wasn't used.*/ 5431 if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) { 5432 buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen)); 5433 } 5434 } 5435 5436 data->bytes_transmitted += sent; 5437 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 5438 /* 5439 * Still more data to write when socket becomes 5440 * writable again. 5441 */ 5442 return; 5443 } 5444 5445 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 5446 5447 if (data->query_state == QUERY_IN_AXFR || 5448 data->query_state == QUERY_IN_IXFR) { 5449 /* Continue processing AXFR and writing back results. */ 5450 buffer_clear(q->packet); 5451 if(data->query_state == QUERY_IN_AXFR) 5452 data->query_state = query_axfr(data->nsd, q, 0); 5453 else data->query_state = query_ixfr(data->nsd, q); 5454 if (data->query_state != QUERY_PROCESSED) { 5455 query_add_optional(data->query, data->nsd, &now); 5456 5457 /* Reset data. */ 5458 buffer_flip(q->packet); 5459 q->tcplen = buffer_remaining(q->packet); 5460 data->bytes_transmitted = 0; 5461 /* Reset to writing mode. */ 5462 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 5463 5464 /* 5465 * Write data if/when the socket is writable 5466 * again. 
5467 */ 5468 return; 5469 } 5470 } 5471 5472 /* 5473 * Done sending, wait for the next request to arrive on the 5474 * TCP socket by installing the TCP read handler. 5475 */ 5476 if ((data->nsd->tcp_query_count > 0 && 5477 data->query_count >= data->nsd->tcp_query_count) || 5478 data->tcp_no_more_queries) { 5479 5480 (void) shutdown(fd, SHUT_WR); 5481 } 5482 5483 data->bytes_transmitted = 0; 5484 data->query_needs_reset = 1; 5485 5486 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 5487 } 5488 #endif 5489 5490 static void 5491 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event), 5492 void* ATTR_UNUSED(arg)) 5493 { 5494 if(slowaccept) { 5495 configure_handler_event_types(EV_PERSIST | EV_READ); 5496 slowaccept = 0; 5497 } 5498 } 5499 5500 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) 5501 { 5502 #ifndef HAVE_ACCEPT4 5503 int s = accept(fd, addr, addrlen); 5504 if (s != -1) { 5505 if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) { 5506 log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno)); 5507 close(s); 5508 s = -1; 5509 errno=EINTR; /* stop error printout as error in accept4 5510 by setting this errno, it omits printout, in 5511 later code that calls nsd_accept4 */ 5512 } 5513 } 5514 return s; 5515 #else 5516 return accept4(fd, addr, addrlen, SOCK_NONBLOCK); 5517 #endif /* HAVE_ACCEPT4 */ 5518 } 5519 5520 /* 5521 * Handle an incoming TCP connection. The connection is accepted and 5522 * a new TCP reader event handler is added. The TCP handler 5523 * is responsible for cleanup when the connection is closed. 
5524 */ 5525 static void 5526 handle_tcp_accept(int fd, short event, void* arg) 5527 { 5528 struct tcp_accept_handler_data *data 5529 = (struct tcp_accept_handler_data *) arg; 5530 int s; 5531 int reject = 0; 5532 struct tcp_handler_data *tcp_data; 5533 region_type *tcp_region; 5534 #ifdef INET6 5535 struct sockaddr_storage addr; 5536 #else 5537 struct sockaddr_in addr; 5538 #endif 5539 socklen_t addrlen; 5540 struct timeval timeout; 5541 5542 if (!(event & EV_READ)) { 5543 return; 5544 } 5545 5546 if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) { 5547 reject = data->nsd->options->tcp_reject_overflow; 5548 if (!reject) { 5549 return; 5550 } 5551 } 5552 5553 /* Accept it... */ 5554 addrlen = sizeof(addr); 5555 s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen); 5556 if (s == -1) { 5557 /** 5558 * EMFILE and ENFILE is a signal that the limit of open 5559 * file descriptors has been reached. Pause accept(). 5560 * EINTR is a signal interrupt. The others are various OS ways 5561 * of saying that the client has closed the connection. 
5562 */ 5563 if (errno == EMFILE || errno == ENFILE) { 5564 if (!slowaccept) { 5565 /* disable accept events */ 5566 struct timeval tv; 5567 configure_handler_event_types(0); 5568 tv.tv_sec = SLOW_ACCEPT_TIMEOUT; 5569 tv.tv_usec = 0L; 5570 memset(&slowaccept_event, 0, 5571 sizeof(slowaccept_event)); 5572 event_set(&slowaccept_event, -1, EV_TIMEOUT, 5573 handle_slowaccept_timeout, NULL); 5574 (void)event_base_set(data->event.ev_base, 5575 &slowaccept_event); 5576 (void)event_add(&slowaccept_event, &tv); 5577 slowaccept = 1; 5578 /* We don't want to spam the logs here */ 5579 } 5580 } else if (errno != EINTR 5581 && errno != EWOULDBLOCK 5582 #ifdef ECONNABORTED 5583 && errno != ECONNABORTED 5584 #endif /* ECONNABORTED */ 5585 #ifdef EPROTO 5586 && errno != EPROTO 5587 #endif /* EPROTO */ 5588 ) { 5589 log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); 5590 } 5591 return; 5592 } 5593 5594 if (reject) { 5595 shutdown(s, SHUT_RDWR); 5596 close(s); 5597 return; 5598 } 5599 5600 /* 5601 * This region is deallocated when the TCP connection is 5602 * closed by the TCP handler. 
5603 */ 5604 tcp_region = region_create(xalloc, free); 5605 tcp_data = (struct tcp_handler_data *) region_alloc( 5606 tcp_region, sizeof(struct tcp_handler_data)); 5607 tcp_data->region = tcp_region; 5608 tcp_data->query = query_create(tcp_region, compressed_dname_offsets, 5609 compression_table_size, compressed_dnames); 5610 tcp_data->nsd = data->nsd; 5611 tcp_data->query_count = 0; 5612 #ifdef HAVE_SSL 5613 tcp_data->shake_state = tls_hs_none; 5614 /* initialize both incase of dangling pointers */ 5615 tcp_data->tls = NULL; 5616 tcp_data->tls_auth = NULL; 5617 #endif 5618 tcp_data->query_needs_reset = 1; 5619 tcp_data->pp2_enabled = data->pp2_enabled; 5620 tcp_data->pp2_header_state = pp2_header_none; 5621 tcp_data->prev = NULL; 5622 tcp_data->next = NULL; 5623 5624 tcp_data->query_state = QUERY_PROCESSED; 5625 tcp_data->bytes_transmitted = 0; 5626 memcpy(&tcp_data->query->remote_addr, &addr, addrlen); 5627 tcp_data->query->remote_addrlen = addrlen; 5628 /* Copy remote_address to client_address. 5629 * Simplest way/time for streams to do that. 
*/ 5630 memcpy(&tcp_data->query->client_addr, &addr, addrlen); 5631 tcp_data->query->client_addrlen = addrlen; 5632 tcp_data->query->is_proxied = 0; 5633 5634 tcp_data->tcp_no_more_queries = 0; 5635 tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000; 5636 if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) { 5637 /* very busy, give smaller timeout */ 5638 tcp_data->tcp_timeout = 200; 5639 } 5640 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5641 timeout.tv_sec = tcp_data->tcp_timeout / 1000; 5642 timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000; 5643 5644 #ifdef USE_DNSTAP 5645 /* save the address of the connection */ 5646 tcp_data->socket = data->socket; 5647 #endif /* USE_DNSTAP */ 5648 5649 #ifdef HAVE_SSL 5650 if (data->tls_accept) { 5651 tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s); 5652 if(!tcp_data->tls) { 5653 close(s); 5654 return; 5655 } 5656 tcp_data->query->tls = tcp_data->tls; 5657 tcp_data->shake_state = tls_hs_read; 5658 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5659 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5660 handle_tls_reading, tcp_data); 5661 } else if (data->tls_auth_accept) { 5662 tcp_data->tls_auth = incoming_ssl_fd(tcp_data->nsd->tls_auth_ctx, s); 5663 if(!tcp_data->tls_auth) { 5664 close(s); 5665 return; 5666 } 5667 tcp_data->query->tls_auth = tcp_data->tls_auth; 5668 tcp_data->shake_state = tls_hs_read; 5669 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5670 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5671 handle_tls_reading, tcp_data); 5672 } else { 5673 #endif 5674 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 5675 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 5676 handle_tcp_reading, tcp_data); 5677 #ifdef HAVE_SSL 5678 } 5679 #endif 5680 if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) { 5681 log_msg(LOG_ERR, "cannot set tcp event base"); 5682 close(s); 5683 
region_destroy(tcp_region); 5684 return; 5685 } 5686 if(event_add(&tcp_data->event, &timeout) != 0) { 5687 log_msg(LOG_ERR, "cannot add tcp to event base"); 5688 close(s); 5689 region_destroy(tcp_region); 5690 return; 5691 } 5692 if(tcp_active_list) { 5693 tcp_active_list->prev = tcp_data; 5694 tcp_data->next = tcp_active_list; 5695 } 5696 tcp_active_list = tcp_data; 5697 5698 /* 5699 * Keep track of the total number of TCP handlers installed so 5700 * we can stop accepting connections when the maximum number 5701 * of simultaneous TCP connections is reached. 5702 * 5703 * If tcp-reject-overflow is enabled, however, then we do not 5704 * change the handler event type; we keep it as-is and accept 5705 * overflow TCP connections only so that we can forcibly kill 5706 * them off. 5707 */ 5708 ++data->nsd->current_tcp_count; 5709 if (!data->nsd->options->tcp_reject_overflow && 5710 data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) 5711 { 5712 configure_handler_event_types(0); 5713 } 5714 } 5715 5716 #ifdef USE_XDP 5717 static void handle_xdp(int fd, short event, void* arg) { 5718 struct xdp_handler_data *data = (struct xdp_handler_data*) arg; 5719 5720 if ((event & EV_READ)) 5721 xdp_handle_recv_and_send(data->server); 5722 (void)fd; 5723 } 5724 #endif 5725 5726 static void 5727 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout) 5728 { 5729 size_t i; 5730 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 5731 for (i = 0; i < nsd->child_count; ++i) { 5732 if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) { 5733 if (write(nsd->children[i].child_fd, 5734 &command, 5735 sizeof(command)) == -1) 5736 { 5737 if(errno != EAGAIN && errno != EINTR) 5738 log_msg(LOG_ERR, "problems sending command %d to server %d: %s", 5739 (int) command, 5740 (int) nsd->children[i].pid, 5741 strerror(errno)); 5742 } else if (timeout > 0) { 5743 (void)block_read(NULL, 5744 nsd->children[i].child_fd, 5745 &command, 
sizeof(command), timeout); 5746 } 5747 fsync(nsd->children[i].child_fd); 5748 close(nsd->children[i].child_fd); 5749 nsd->children[i].child_fd = -1; 5750 } 5751 } 5752 } 5753 5754 static void 5755 send_children_quit(struct nsd* nsd) 5756 { 5757 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit")); 5758 send_children_command(nsd, NSD_QUIT, 0); 5759 } 5760 5761 static void 5762 send_children_quit_and_wait(struct nsd* nsd) 5763 { 5764 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait")); 5765 send_children_command(nsd, NSD_QUIT_CHILD, 3); 5766 } 5767 5768 #ifdef BIND8_STATS 5769 static void 5770 set_children_stats(struct nsd* nsd) 5771 { 5772 size_t i; 5773 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 5774 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children")); 5775 for (i = 0; i < nsd->child_count; ++i) { 5776 nsd->children[i].need_to_send_STATS = 1; 5777 nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE; 5778 } 5779 } 5780 #endif /* BIND8_STATS */ 5781 5782 static void 5783 configure_handler_event_types(short event_types) 5784 { 5785 size_t i; 5786 5787 for (i = 0; i < tcp_accept_handler_count; ++i) { 5788 struct event* handler = &tcp_accept_handlers[i].event; 5789 if(event_types) { 5790 /* reassign */ 5791 int fd = handler->ev_fd; 5792 struct event_base* base = handler->ev_base; 5793 if(tcp_accept_handlers[i].event_added) 5794 event_del(handler); 5795 memset(handler, 0, sizeof(*handler)); 5796 event_set(handler, fd, event_types, 5797 handle_tcp_accept, &tcp_accept_handlers[i]); 5798 if(event_base_set(base, handler) != 0) 5799 log_msg(LOG_ERR, "conhand: cannot event_base"); 5800 if(event_add(handler, NULL) != 0) 5801 log_msg(LOG_ERR, "conhand: cannot event_add"); 5802 tcp_accept_handlers[i].event_added = 1; 5803 } else { 5804 /* remove */ 5805 if(tcp_accept_handlers[i].event_added) { 5806 event_del(handler); 5807 tcp_accept_handlers[i].event_added = 0; 5808 } 5809 } 5810 } 5811 } 5812