1 /* $NetBSD: tcp.c,v 1.14 2026/04/08 00:16:16 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 #include <libgen.h> 17 #include <string.h> 18 #include <unistd.h> 19 20 #include <isc/async.h> 21 #include <isc/atomic.h> 22 #include <isc/barrier.h> 23 #include <isc/buffer.h> 24 #include <isc/condition.h> 25 #include <isc/errno.h> 26 #include <isc/log.h> 27 #include <isc/magic.h> 28 #include <isc/mem.h> 29 #include <isc/netmgr.h> 30 #include <isc/quota.h> 31 #include <isc/random.h> 32 #include <isc/refcount.h> 33 #include <isc/region.h> 34 #include <isc/result.h> 35 #include <isc/sockaddr.h> 36 #include <isc/stdtime.h> 37 #include <isc/thread.h> 38 #include <isc/util.h> 39 #include <isc/uv.h> 40 41 #include "../loop_p.h" 42 #include "netmgr-int.h" 43 44 static atomic_uint_fast32_t last_tcpquota_log = 0; 45 46 static bool 47 can_log_tcp_quota(void) { 48 isc_stdtime_t last; 49 isc_stdtime_t now = isc_stdtime_now(); 50 last = atomic_exchange_relaxed(&last_tcpquota_log, now); 51 if (now != last) { 52 return true; 53 } 54 55 return false; 56 } 57 58 static isc_result_t 59 tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); 60 61 static isc_result_t 62 tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); 63 static void 64 tcp_connect_cb(uv_connect_t *uvreq, int status); 65 static void 66 tcp_stop_cb(uv_handle_t *handle); 67 68 static void 69 tcp_connection_cb(uv_stream_t *server, int status); 70 71 static void 72 tcp_close_cb(uv_handle_t *uvhandle); 73 74 static isc_result_t 75 accept_connection(isc_nmsocket_t *ssock); 76 77 static void 78 quota_accept_cb(void *arg); 79 80 static isc_result_t 81 tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { 82 isc__networker_t *worker = NULL; 83 isc_result_t result = ISC_R_UNSET; 84 int r; 85 86 REQUIRE(VALID_NMSOCK(sock)); 87 REQUIRE(VALID_UVREQ(req)); 88 89 REQUIRE(sock->tid == isc_tid()); 90 91 worker = sock->worker; 92 93 sock->connecting = true; 94 95 /* 2 minute timeout */ 96 result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); 97 RUNTIME_CHECK(result == ISC_R_SUCCESS); 98 99 r = uv_tcp_init(&worker->loop->loop, &sock->uv_handle.tcp); 100 UV_RUNTIME_CHECK(uv_tcp_init, r); 101 uv_handle_set_data(&sock->uv_handle.handle, sock); 102 103 r = uv_timer_init(&worker->loop->loop, &sock->read_timer); 104 UV_RUNTIME_CHECK(uv_timer_init, r); 105 uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); 106 107 r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); 108 if (r != 0) { 109 isc__nm_closesocket(sock->fd); 110 isc__nm_incstats(sock, STATID_OPENFAIL); 111 return isc_uverr2result(r); 112 } 113 isc__nm_incstats(sock, STATID_OPEN); 114 115 if (req->local.length != 0) { 116 r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0); 117 if (r != 0) { 118 isc__nm_incstats(sock, STATID_BINDFAIL); 119 return isc_uverr2result(r); 120 } 121 } 122 123 isc__nm_set_network_buffers(sock->worker->netmgr, 124 &sock->uv_handle.handle); 125 126 uv_handle_set_data(&req->uv_req.handle, req); 127 r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp, 128 &req->peer.type.sa, tcp_connect_cb); 129 if (r != 0) { 130 isc__nm_incstats(sock, STATID_CONNECTFAIL); 131 return isc_uverr2result(r); 132 } 133 134 uv_handle_set_data((uv_handle_t *)&sock->read_timer, 135 &req->uv_req.connect); 136 isc__nmsocket_timer_start(sock); 137 138 return ISC_R_SUCCESS; 139 } 140 141 static void 142 tcp_connect_cb(uv_connect_t *uvreq, int status) { 143 isc_result_t result = ISC_R_UNSET; 144 isc__nm_uvreq_t *req = NULL; 145 isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); 146 struct sockaddr_storage ss; 147 isc__networker_t *worker = NULL; 148 int r; 149 150 REQUIRE(VALID_NMSOCK(sock)); 151 REQUIRE(sock->tid == isc_tid()); 152 153 worker = sock->worker; 154 155 req = uv_handle_get_data((uv_handle_t *)uvreq); 156 157 REQUIRE(VALID_UVREQ(req)); 158 REQUIRE(VALID_NMHANDLE(req->handle)); 159 160 INSIST(sock->connecting); 161 162 if (sock->timedout || status == UV_ETIMEDOUT) { 163 /* Connection timed-out */ 164 result = ISC_R_TIMEDOUT; 165 goto error; 166 } else if (isc__nm_closing(worker)) { 167 /* Network manager shutting down */ 168 result = ISC_R_SHUTTINGDOWN; 169 goto error; 170 } else if (isc__nmsocket_closing(sock)) { 171 /* Connection canceled */ 172 result = ISC_R_CANCELED; 173 goto error; 174 } else if (status == UV_EADDRINUSE) { 175 /* 176 * On FreeBSD the TCP connect() call sometimes results in a 177 * spurious transient EADDRINUSE. Try a few more times before 178 * giving up. 179 */ 180 if (--req->connect_tries > 0) { 181 r = uv_tcp_connect(&req->uv_req.connect, 182 &sock->uv_handle.tcp, 183 &req->peer.type.sa, tcp_connect_cb); 184 if (r != 0) { 185 result = isc_uverr2result(r); 186 goto error; 187 } 188 return; 189 } 190 result = isc_uverr2result(status); 191 goto error; 192 } else if (status != 0) { 193 result = isc_uverr2result(status); 194 goto error; 195 } 196 197 isc__nmsocket_timer_stop(sock); 198 uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); 199 200 isc__nm_incstats(sock, STATID_CONNECT); 201 r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss, 202 &(int){ sizeof(ss) }); 203 if (r != 0) { 204 result = isc_uverr2result(r); 205 goto error; 206 } 207 208 sock->connecting = false; 209 sock->connected = true; 210 211 result = isc_sockaddr_fromsockaddr(&sock->peer, (struct sockaddr *)&ss); 212 RUNTIME_CHECK(result == ISC_R_SUCCESS); 213 214 isc__nm_connectcb(sock, req, ISC_R_SUCCESS, false); 215 216 return; 217 error: 218 isc__nm_failed_connect_cb(sock, req, result, false); 219 } 220 221 void 222 isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, 223 isc_nm_cb_t connect_cb, void *connect_cbarg, 224 unsigned int timeout) { 225 isc_result_t result = ISC_R_SUCCESS; 226 isc_nmsocket_t *sock = NULL; 227 isc__nm_uvreq_t *req = NULL; 228 sa_family_t sa_family; 229 isc__networker_t *worker = NULL; 230 uv_os_sock_t fd = -1; 231 in_port_t port_low, port_high; 232 233 REQUIRE(VALID_NM(mgr)); 234 REQUIRE(local != NULL); 235 REQUIRE(peer != NULL); 236 237 worker = &mgr->workers[isc_tid()]; 238 239 if (isc__nm_closing(worker)) { 240 connect_cb(NULL, ISC_R_SHUTTINGDOWN, connect_cbarg); 241 return; 242 } 243 244 sa_family = peer->type.sa.sa_family; 245 246 result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &fd); 247 if (result != ISC_R_SUCCESS) { 248 connect_cb(NULL, result, connect_cbarg); 249 return; 250 } 251 252 sock = isc_mempool_get(worker->nmsocket_pool); 253 isc__nmsocket_init(sock, worker, isc_nm_tcpsocket, local, NULL); 254 255 sock->connect_timeout = timeout; 256 sock->fd = fd; 257 sock->client = true; 258 259 req = isc__nm_uvreq_get(sock); 260 req->cb.connect = connect_cb; 261 req->cbarg = connect_cbarg; 262 req->peer = *peer; 263 req->local = *local; 264 req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface); 265 266 (void)isc__nm_socket_min_mtu(sock->fd, sa_family); 267 (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); 268 port_low = (sa_family == AF_INET) ? mgr->port_low4 : mgr->port_low6; 269 port_high = (sa_family == AF_INET) ? mgr->port_high4 : mgr->port_high6; 270 result = isc__nm_socket_max_port_range(sock->fd, sa_family, port_low, 271 port_high); 272 if (result != ISC_R_SUCCESS) { 273 isc__nmsocket_log(sock, ISC_LOG_DEBUG(99), 274 "setting up IP_BIND_ADDRESS_NO_PORT or " 275 "IP_LOCAL_PORT_RANGE failed: %s\n", 276 result == ISC_R_RANGE 277 ? isc_result_totext(result) 278 : strerror(errno)); 279 } 280 281 sock->active = true; 282 283 result = tcp_connect_direct(sock, req); 284 if (result != ISC_R_SUCCESS) { 285 sock->active = false; 286 isc__nm_tcp_close(sock); 287 isc__nm_connectcb(sock, req, result, true); 288 } 289 290 /* 291 * The sock is now attached to the handle. 292 */ 293 isc__nmsocket_detach(&sock); 294 } 295 296 static uv_os_sock_t 297 isc__nm_tcp_lb_socket(isc_nm_t *mgr, sa_family_t sa_family) { 298 isc_result_t result; 299 uv_os_sock_t sock; 300 301 result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock); 302 RUNTIME_CHECK(result == ISC_R_SUCCESS); 303 304 (void)isc__nm_socket_v6only(sock, sa_family); 305 306 /* FIXME: set mss */ 307 308 result = isc__nm_socket_reuse(sock, 1); 309 RUNTIME_CHECK(result == ISC_R_SUCCESS); 310 311 if (mgr->load_balance_sockets) { 312 result = isc__nm_socket_reuse_lb(sock); 313 RUNTIME_CHECK(result == ISC_R_SUCCESS); 314 } 315 316 return sock; 317 } 318 319 static void 320 start_tcp_child_job(void *arg) { 321 isc_nmsocket_t *sock = arg; 322 323 REQUIRE(VALID_NMSOCK(sock)); 324 REQUIRE(VALID_NMSOCK(sock->parent)); 325 REQUIRE(sock->type == isc_nm_tcpsocket); 326 REQUIRE(sock->tid == isc_tid()); 327 328 sa_family_t sa_family = sock->iface.type.sa.sa_family; 329 int r, flags = 0; 330 isc_result_t result = ISC_R_UNSET; 331 isc_loop_t *loop = sock->worker->loop; 332 struct sockaddr_storage ss; 333 334 (void)isc__nm_socket_min_mtu(sock->fd, sa_family); 335 (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); 336 337 r = uv_tcp_init(&loop->loop, &sock->uv_handle.tcp); 338 UV_RUNTIME_CHECK(uv_tcp_init, r); 339 uv_handle_set_data(&sock->uv_handle.handle, sock); 340 /* This keeps the socket alive after everything else is gone */ 341 isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL }); 342 343 r = uv_timer_init(&loop->loop, &sock->read_timer); 344 UV_RUNTIME_CHECK(uv_timer_init, r); 345 uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); 346 347 r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); 348 if (r < 0) { 349 isc__nm_closesocket(sock->fd); 350 isc__nm_incstats(sock, STATID_OPENFAIL); 351 goto done; 352 } 353 isc__nm_incstats(sock, STATID_OPEN); 354 355 if (sa_family == AF_INET6) { 356 flags = UV_TCP_IPV6ONLY; 357 } 358 359 if (sock->worker->netmgr->load_balance_sockets) { 360 r = isc__nm_tcp_freebind(&sock->uv_handle.tcp, 361 &sock->iface.type.sa, flags); 362 if (r < 0) { 363 isc__nm_incstats(sock, STATID_BINDFAIL); 364 goto done; 365 } 366 } else if (sock->tid == 0) { 367 r = isc__nm_tcp_freebind(&sock->uv_handle.tcp, 368 &sock->iface.type.sa, flags); 369 if (r < 0) { 370 isc__nm_incstats(sock, STATID_BINDFAIL); 371 goto done; 372 } 373 sock->parent->uv_handle.tcp.flags = sock->uv_handle.tcp.flags; 374 } else { 375 /* The socket is already bound, just copy the flags */ 376 sock->uv_handle.tcp.flags = sock->parent->uv_handle.tcp.flags; 377 } 378 379 isc__nm_set_network_buffers(sock->worker->netmgr, 380 &sock->uv_handle.handle); 381 382 /* 383 * The callback will run in the same thread uv_listen() was called 384 * from, so a race with tcp_connection_cb() isn't possible. 385 */ 386 r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog, 387 tcp_connection_cb); 388 if (r != 0) { 389 isc__nmsocket_log(sock, ISC_LOG_ERROR, "uv_listen failed: %s", 390 isc_result_totext(isc_uverr2result(r))); 391 isc__nm_incstats(sock, STATID_BINDFAIL); 392 goto done; 393 } 394 395 if (sock->tid == 0) { 396 r = uv_tcp_getsockname(&sock->uv_handle.tcp, 397 (struct sockaddr *)&ss, 398 &(int){ sizeof(ss) }); 399 if (r != 0) { 400 goto done; 401 } 402 403 result = isc_sockaddr_fromsockaddr(&sock->parent->iface, 404 (struct sockaddr *)&ss); 405 if (result != ISC_R_SUCCESS) { 406 goto done_result; 407 } 408 } 409 410 done: 411 result = isc_uverr2result(r); 412 413 done_result: 414 if (result != ISC_R_SUCCESS) { 415 sock->pquota = NULL; 416 } 417 418 sock->result = result; 419 420 REQUIRE(!loop->paused); 421 422 if (sock->tid != 0) { 423 isc_barrier_wait(&sock->parent->listen_barrier); 424 } 425 } 426 427 static void 428 start_tcp_child(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nmsocket_t *sock, 429 uv_os_sock_t fd, int tid) { 430 isc_nmsocket_t *csock = &sock->children[tid]; 431 isc__networker_t *worker = &mgr->workers[tid]; 432 433 isc__nmsocket_init(csock, worker, isc_nm_tcpsocket, iface, sock); 434 csock->accept_cb = sock->accept_cb; 435 csock->accept_cbarg = sock->accept_cbarg; 436 csock->backlog = sock->backlog; 437 438 /* 439 * Quota isn't attached, just assigned. 440 */ 441 csock->pquota = sock->pquota; 442 443 if (mgr->load_balance_sockets) { 444 UNUSED(fd); 445 csock->fd = isc__nm_tcp_lb_socket(mgr, 446 iface->type.sa.sa_family); 447 } else { 448 csock->fd = dup(fd); 449 } 450 REQUIRE(csock->fd >= 0); 451 452 if (tid == 0) { 453 start_tcp_child_job(csock); 454 } else { 455 isc_async_run(worker->loop, start_tcp_child_job, csock); 456 } 457 } 458 459 isc_result_t 460 isc_nm_listentcp(isc_nm_t *mgr, uint32_t workers, isc_sockaddr_t *iface, 461 isc_nm_accept_cb_t accept_cb, void *accept_cbarg, int backlog, 462 isc_quota_t *quota, isc_nmsocket_t **sockp) { 463 isc_nmsocket_t *sock = NULL; 464 uv_os_sock_t fd = -1; 465 isc_result_t result = ISC_R_UNSET; 466 isc__networker_t *worker = NULL; 467 468 REQUIRE(VALID_NM(mgr)); 469 REQUIRE(isc_tid() == 0); 470 471 if (workers == 0) { 472 workers = mgr->nloops; 473 } 474 REQUIRE(workers <= mgr->nloops); 475 476 worker = &mgr->workers[0]; 477 sock = isc_mempool_get(worker->nmsocket_pool); 478 isc__nmsocket_init(sock, worker, isc_nm_tcplistener, iface, NULL); 479 480 sock->nchildren = (workers == ISC_NM_LISTEN_ALL) ? (uint32_t)mgr->nloops 481 : workers; 482 sock->children = isc_mem_cget(worker->mctx, sock->nchildren, 483 sizeof(sock->children[0])); 484 485 isc__nmsocket_barrier_init(sock); 486 487 sock->accept_cb = accept_cb; 488 sock->accept_cbarg = accept_cbarg; 489 sock->backlog = backlog; 490 sock->pquota = quota; 491 492 if (!mgr->load_balance_sockets) { 493 fd = isc__nm_tcp_lb_socket(mgr, iface->type.sa.sa_family); 494 } 495 496 start_tcp_child(mgr, iface, sock, fd, 0); 497 result = sock->children[0].result; 498 INSIST(result != ISC_R_UNSET); 499 500 for (size_t i = 1; i < sock->nchildren; i++) { 501 start_tcp_child(mgr, iface, sock, fd, i); 502 } 503 504 isc_barrier_wait(&sock->listen_barrier); 505 506 if (!mgr->load_balance_sockets) { 507 isc__nm_closesocket(fd); 508 } 509 510 /* 511 * If any of the child sockets have failed then isc_nm_listentcp 512 * fails. 513 */ 514 for (size_t i = 1; i < sock->nchildren; i++) { 515 if (result == ISC_R_SUCCESS && 516 sock->children[i].result != ISC_R_SUCCESS) 517 { 518 result = sock->children[i].result; 519 } 520 } 521 522 if (result != ISC_R_SUCCESS) { 523 sock->active = false; 524 isc__nm_tcp_stoplistening(sock); 525 isc_nmsocket_close(&sock); 526 527 return result; 528 } 529 530 sock->active = true; 531 532 *sockp = sock; 533 return ISC_R_SUCCESS; 534 } 535 536 static void 537 tcp_connection_cb(uv_stream_t *server, int status) { 538 isc_nmsocket_t *ssock = uv_handle_get_data((uv_handle_t *)server); 539 isc_result_t result; 540 541 REQUIRE(ssock->accept_cb != NULL); 542 543 if (status != 0) { 544 result = isc_uverr2result(status); 545 goto done; 546 } 547 548 REQUIRE(VALID_NMSOCK(ssock)); 549 REQUIRE(ssock->tid == isc_tid()); 550 551 if (isc__nmsocket_closing(ssock)) { 552 result = ISC_R_CANCELED; 553 goto done; 554 } 555 556 /* Prepare the child socket */ 557 isc_nmsocket_t *csock = isc_mempool_get(ssock->worker->nmsocket_pool); 558 isc__nmsocket_init(csock, ssock->worker, isc_nm_tcpsocket, 559 &ssock->iface, NULL); 560 isc__nmsocket_attach(ssock, &csock->server); 561 562 if (csock->server->pquota != NULL) { 563 result = isc_quota_acquire_cb(csock->server->pquota, 564 &csock->quotacb, quota_accept_cb, 565 csock); 566 if (result == ISC_R_QUOTA) { 567 csock->quota_accept_ts = isc_time_monotonic(); 568 isc__nm_incstats(ssock, STATID_ACCEPTFAIL); 569 goto done; 570 } 571 } 572 573 result = accept_connection(csock); 574 done: 575 isc__nm_accept_connection_log(ssock, result, can_log_tcp_quota()); 576 } 577 578 static void 579 stop_tcp_child_job(void *arg) { 580 isc_nmsocket_t *sock = arg; 581 582 REQUIRE(VALID_NMSOCK(sock)); 583 REQUIRE(sock->tid == isc_tid()); 584 REQUIRE(sock->parent != NULL); 585 REQUIRE(sock->type == isc_nm_tcpsocket); 586 REQUIRE(!sock->closing); 587 588 sock->active = false; 589 sock->closing = true; 590 591 /* 592 * The order of the close operation is important here, the uv_close() 593 * gets scheduled in the reverse order, so we need to close the timer 594 * last, so its gone by the time we destroy the socket 595 */ 596 597 /* 2. close the listening socket */ 598 isc__nmsocket_clearcb(sock); 599 isc__nm_stop_reading(sock); 600 uv_close(&sock->uv_handle.handle, tcp_stop_cb); 601 602 /* 1. close the read timer */ 603 isc__nmsocket_timer_stop(sock); 604 uv_close(&sock->read_timer, NULL); 605 606 REQUIRE(!sock->worker->loop->paused); 607 isc_barrier_wait(&sock->parent->stop_barrier); 608 } 609 610 static void 611 stop_tcp_child(isc_nmsocket_t *sock) { 612 REQUIRE(VALID_NMSOCK(sock)); 613 614 if (sock->tid == 0) { 615 stop_tcp_child_job(sock); 616 } else { 617 isc_async_run(sock->worker->loop, stop_tcp_child_job, sock); 618 } 619 } 620 621 void 622 isc__nm_tcp_stoplistening(isc_nmsocket_t *sock) { 623 REQUIRE(VALID_NMSOCK(sock)); 624 REQUIRE(sock->type == isc_nm_tcplistener); 625 REQUIRE(sock->tid == isc_tid()); 626 REQUIRE(sock->tid == 0); 627 REQUIRE(!sock->closing); 628 629 sock->closing = true; 630 631 /* Mark the parent socket inactive */ 632 sock->active = false; 633 634 /* Stop all the other threads' children */ 635 for (size_t i = 1; i < sock->nchildren; i++) { 636 stop_tcp_child(&sock->children[i]); 637 } 638 639 /* Stop the child for the main thread */ 640 stop_tcp_child(&sock->children[0]); 641 642 /* Stop the parent */ 643 sock->closed = true; 644 645 isc__nmsocket_prep_destroy(sock); 646 } 647 648 static void 649 tcp_stop_cb(uv_handle_t *handle) { 650 isc_nmsocket_t *sock = uv_handle_get_data(handle); 651 uv_handle_set_data(handle, NULL); 652 653 REQUIRE(VALID_NMSOCK(sock)); 654 REQUIRE(sock->tid == isc_tid()); 655 REQUIRE(sock->closing); 656 REQUIRE(sock->type == isc_nm_tcpsocket); 657 REQUIRE(!sock->closed); 658 659 sock->closed = true; 660 661 isc__nm_incstats(sock, STATID_CLOSE); 662 663 isc__nmsocket_detach(&sock); 664 } 665 666 void 667 isc__nm_tcp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, 668 bool async) { 669 REQUIRE(VALID_NMSOCK(sock)); 670 REQUIRE(result != ISC_R_SUCCESS); 671 672 isc__nmsocket_timer_stop(sock); 673 isc__nm_stop_reading(sock); 674 sock->reading = false; 675 676 if (sock->recv_cb != NULL) { 677 isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); 678 isc__nmsocket_clearcb(sock); 679 isc__nm_readcb(sock, req, result, async); 680 } 681 682 isc__nmsocket_prep_destroy(sock); 683 } 684 685 void 686 isc__nm_tcp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { 687 isc_nmsocket_t *sock; 688 isc_nm_t *netmgr; 689 isc_result_t result; 690 691 REQUIRE(VALID_NMHANDLE(handle)); 692 REQUIRE(VALID_NMSOCK(handle->sock)); 693 694 sock = handle->sock; 695 netmgr = sock->worker->netmgr; 696 697 REQUIRE(sock->type == isc_nm_tcpsocket); 698 REQUIRE(sock->statichandle == handle); 699 700 sock->recv_cb = cb; 701 sock->recv_cbarg = cbarg; 702 703 /* Initialize the timer */ 704 if (sock->read_timeout == 0) { 705 sock->read_timeout = 706 sock->keepalive 707 ? atomic_load_relaxed(&netmgr->keepalive) 708 : atomic_load_relaxed(&netmgr->idle); 709 } 710 711 if (isc__nmsocket_closing(sock)) { 712 result = ISC_R_CANCELED; 713 goto failure; 714 } 715 716 if (!sock->reading_throttled) { 717 result = isc__nm_start_reading(sock); 718 if (result != ISC_R_SUCCESS) { 719 goto failure; 720 } 721 } 722 723 sock->reading = true; 724 725 if (!sock->manual_read_timer) { 726 isc__nmsocket_timer_start(sock); 727 } 728 729 return; 730 failure: 731 isc__nm_tcp_failed_read_cb(sock, result, true); 732 } 733 734 void 735 isc__nm_tcp_read_stop(isc_nmhandle_t *handle) { 736 REQUIRE(VALID_NMHANDLE(handle)); 737 REQUIRE(VALID_NMSOCK(handle->sock)); 738 739 isc_nmsocket_t *sock = handle->sock; 740 741 if (!sock->manual_read_timer) { 742 isc__nmsocket_timer_stop(sock); 743 } 744 isc__nm_stop_reading(sock); 745 sock->reading = false; 746 747 return; 748 } 749 750 void 751 isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { 752 isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)stream); 753 isc__nm_uvreq_t *req = NULL; 754 isc_nm_t *netmgr = NULL; 755 756 REQUIRE(VALID_NMSOCK(sock)); 757 REQUIRE(sock->tid == isc_tid()); 758 REQUIRE(buf != NULL); 759 760 netmgr = sock->worker->netmgr; 761 762 if (isc__nmsocket_closing(sock)) { 763 isc__nm_tcp_failed_read_cb(sock, ISC_R_CANCELED, false); 764 goto free; 765 } 766 767 if (nread < 0) { 768 if (nread != UV_EOF) { 769 isc__nm_incstats(sock, STATID_RECVFAIL); 770 } 771 772 isc__nm_tcp_failed_read_cb(sock, isc_uverr2result(nread), 773 false); 774 775 goto free; 776 } 777 778 req = isc__nm_get_read_req(sock, NULL); 779 780 /* 781 * The callback will be called synchronously because the 782 * result is ISC_R_SUCCESS, so we don't need to retain 783 * the buffer 784 */ 785 req->uvbuf.base = buf->base; 786 req->uvbuf.len = nread; 787 788 if (!sock->client) { 789 sock->read_timeout = 790 sock->keepalive 791 ? atomic_load_relaxed(&netmgr->keepalive) 792 : atomic_load_relaxed(&netmgr->idle); 793 } 794 795 isc__nm_readcb(sock, req, ISC_R_SUCCESS, false); 796 797 if (!sock->client && sock->reading) { 798 /* 799 * Stop reading if we have accumulated enough bytes in the send 800 * queue; this means that the TCP client is not reading back the 801 * data we sending to it, and there's no reason to continue 802 * processing more incoming DNS messages, if the client is not 803 * reading back the responses. 804 */ 805 size_t write_queue_size = 806 uv_stream_get_write_queue_size(&sock->uv_handle.stream); 807 808 if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) { 809 isc__nmsocket_log( 810 sock, ISC_LOG_DEBUG(3), 811 "throttling TCP connection, the other side is " 812 "not reading the data (%zu)", 813 write_queue_size); 814 sock->reading_throttled = true; 815 isc__nm_stop_reading(sock); 816 } 817 } else if (uv_is_active(&sock->uv_handle.handle) && 818 !sock->manual_read_timer) 819 { 820 /* The readcb could have paused the reading */ 821 /* The timer will be updated */ 822 isc__nmsocket_timer_restart(sock); 823 } 824 825 free: 826 if (nread < 0) { 827 /* 828 * The buffer may be a null buffer on error. 829 */ 830 if (buf->base == NULL && buf->len == 0) { 831 return; 832 } 833 } 834 835 isc__nm_free_uvbuf(sock, buf); 836 } 837 838 /* 839 * This is called after we get a quota_accept_cb() callback. 840 */ 841 static void 842 tcpaccept_cb(void *arg) { 843 isc_nmsocket_t *csock = arg; 844 isc_nmsocket_t *ssock = csock->server; 845 846 REQUIRE(VALID_NMSOCK(csock)); 847 REQUIRE(csock->tid == isc_tid()); 848 849 isc_result_t result = accept_connection(csock); 850 isc__nm_accept_connection_log(ssock, result, can_log_tcp_quota()); 851 isc__nmsocket_detach(&csock); 852 } 853 854 static void 855 quota_accept_cb(void *arg) { 856 isc_nmsocket_t *csock = arg; 857 isc_nmsocket_t *ssock = csock->server; 858 859 REQUIRE(VALID_NMSOCK(csock)); 860 861 /* 862 * This needs to be asynchronous, because the quota might have been 863 * released by a different child socket. 864 */ 865 if (csock->tid == isc_tid()) { 866 isc_result_t result = accept_connection(csock); 867 isc__nm_accept_connection_log(ssock, result, 868 can_log_tcp_quota()); 869 } else { 870 isc__nmsocket_attach(csock, &(isc_nmsocket_t *){ NULL }); 871 isc_async_run(csock->worker->loop, tcpaccept_cb, csock); 872 } 873 } 874 875 static isc_result_t 876 accept_connection(isc_nmsocket_t *csock) { 877 int r; 878 isc_result_t result; 879 struct sockaddr_storage ss; 880 isc_sockaddr_t local; 881 isc_nmhandle_t *handle = NULL; 882 883 REQUIRE(VALID_NMSOCK(csock)); 884 REQUIRE(VALID_NMSOCK(csock->server)); 885 REQUIRE(csock->tid == isc_tid()); 886 887 csock->accepting = true; 888 csock->accept_cb = csock->server->accept_cb; 889 csock->accept_cbarg = csock->server->accept_cbarg; 890 csock->recv_cb = csock->server->recv_cb; 891 csock->recv_cbarg = csock->server->recv_cbarg; 892 csock->read_timeout = atomic_load_relaxed(&csock->worker->netmgr->init); 893 894 r = uv_tcp_init(&csock->worker->loop->loop, &csock->uv_handle.tcp); 895 UV_RUNTIME_CHECK(uv_tcp_init, r); 896 uv_handle_set_data(&csock->uv_handle.handle, csock); 897 898 r = uv_timer_init(&csock->worker->loop->loop, &csock->read_timer); 899 UV_RUNTIME_CHECK(uv_timer_init, r); 900 uv_handle_set_data((uv_handle_t *)&csock->read_timer, csock); 901 902 if (csock->server->pquota != NULL) { 903 isc__nm_incstats(csock, STATID_CLIENTS); 904 } 905 906 /* 907 * We need to initialize the tcp and timer before failing because 908 * isc__nm_tcp_close() can't handle uninitalized TCP nmsocket. 909 */ 910 if (isc__nmsocket_closing(csock)) { 911 result = ISC_R_CANCELED; 912 goto failure; 913 } 914 915 r = uv_accept(&csock->server->uv_handle.stream, 916 &csock->uv_handle.stream); 917 if (r != 0) { 918 result = isc_uverr2result(r); 919 goto failure; 920 } 921 922 /* Check if the connection is not expired */ 923 if (csock->quota_accept_ts != 0) { 924 /* The timestamp is given in nanoseconds */ 925 const uint64_t time_elapsed_ms = 926 (isc_time_monotonic() - csock->quota_accept_ts) / 927 NS_PER_MS; 928 929 if (time_elapsed_ms >= csock->read_timeout) { 930 /* 931 * At this point we have received a connection from a 932 * queue of accepted connections (via uv_accept()), but 933 * it has expired. We cannot do anything better than 934 * drop it on the floor at this point. 935 */ 936 result = ISC_R_TIMEDOUT; 937 goto failure; 938 } else { 939 /* Adjust the initial read timeout accordingly */ 940 csock->read_timeout -= time_elapsed_ms; 941 } 942 } 943 944 r = uv_tcp_getpeername(&csock->uv_handle.tcp, (struct sockaddr *)&ss, 945 &(int){ sizeof(ss) }); 946 if (r != 0) { 947 result = isc_uverr2result(r); 948 goto failure; 949 } 950 951 result = isc_sockaddr_fromsockaddr(&csock->peer, 952 (struct sockaddr *)&ss); 953 if (result != ISC_R_SUCCESS) { 954 goto failure; 955 } 956 957 r = uv_tcp_getsockname(&csock->uv_handle.tcp, (struct sockaddr *)&ss, 958 &(int){ sizeof(ss) }); 959 if (r != 0) { 960 result = isc_uverr2result(r); 961 goto failure; 962 } 963 964 result = isc_sockaddr_fromsockaddr(&local, (struct sockaddr *)&ss); 965 if (result != ISC_R_SUCCESS) { 966 goto failure; 967 } 968 969 handle = isc__nmhandle_get(csock, NULL, &local); 970 971 result = csock->accept_cb(handle, ISC_R_SUCCESS, csock->accept_cbarg); 972 if (result != ISC_R_SUCCESS) { 973 isc_nmhandle_detach(&handle); 974 goto failure; 975 } 976 977 csock->accepting = false; 978 979 isc__nm_incstats(csock, STATID_ACCEPT); 980 981 /* 982 * The acceptcb needs to attach to the handle if it wants to keep the 983 * connection alive 984 */ 985 isc_nmhandle_detach(&handle); 986 987 /* 988 * sock is now attached to the handle. 989 */ 990 isc__nmsocket_detach(&csock); 991 992 return ISC_R_SUCCESS; 993 994 failure: 995 csock->active = false; 996 csock->accepting = false; 997 998 if (result != ISC_R_NOTCONNECTED) { 999 /* IGNORE: The client disconnected before we could accept */ 1000 isc__nmsocket_log(csock, ISC_LOG_ERROR, 1001 "Accepting TCP connection failed: %s", 1002 isc_result_totext(result)); 1003 } 1004 1005 isc__nmsocket_prep_destroy(csock); 1006 1007 isc__nmsocket_detach(&csock); 1008 1009 return result; 1010 } 1011 1012 static void 1013 tcp_send(isc_nmhandle_t *handle, const isc_region_t *region, isc_nm_cb_t cb, 1014 void *cbarg, const bool dnsmsg) { 1015 REQUIRE(VALID_NMHANDLE(handle)); 1016 REQUIRE(VALID_NMSOCK(handle->sock)); 1017 1018 isc_nmsocket_t *sock = handle->sock; 1019 isc_result_t result; 1020 isc__nm_uvreq_t *uvreq = NULL; 1021 isc_nm_t *netmgr = sock->worker->netmgr; 1022 1023 REQUIRE(sock->type == isc_nm_tcpsocket); 1024 REQUIRE(sock->tid == isc_tid()); 1025 1026 uvreq = isc__nm_uvreq_get(sock); 1027 if (dnsmsg) { 1028 *(uint16_t *)uvreq->tcplen = htons(region->length); 1029 } 1030 uvreq->uvbuf.base = (char *)region->base; 1031 uvreq->uvbuf.len = region->length; 1032 1033 isc_nmhandle_attach(handle, &uvreq->handle); 1034 1035 uvreq->cb.send = cb; 1036 uvreq->cbarg = cbarg; 1037 1038 if (sock->write_timeout == 0) { 1039 sock->write_timeout = 1040 sock->keepalive 1041 ? atomic_load_relaxed(&netmgr->keepalive) 1042 : atomic_load_relaxed(&netmgr->idle); 1043 } 1044 1045 result = tcp_send_direct(sock, uvreq); 1046 if (result != ISC_R_SUCCESS) { 1047 isc__nm_incstats(sock, STATID_SENDFAIL); 1048 isc__nm_failed_send_cb(sock, uvreq, result, true); 1049 } 1050 1051 return; 1052 } 1053 1054 void 1055 isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region, 1056 isc_nm_cb_t cb, void *cbarg) { 1057 tcp_send(handle, region, cb, cbarg, false); 1058 } 1059 1060 void 1061 isc__nm_tcp_senddns(isc_nmhandle_t *handle, const isc_region_t *region, 1062 isc_nm_cb_t cb, void *cbarg) { 1063 tcp_send(handle, region, cb, cbarg, true); 1064 } 1065 1066 static void 1067 tcp_maybe_restart_reading(isc_nmsocket_t *sock) { 1068 if (!sock->client && sock->reading && 1069 !uv_is_active(&sock->uv_handle.handle)) 1070 { 1071 /* 1072 * Restart reading if we have less data in the send queue than 1073 * the send buffer size, this means that the TCP client has 1074 * started reading some data again. Starting reading when we go 1075 * under the limit instead of waiting for all data has been 1076 * flushed allows faster recovery (in case there was a 1077 * congestion and now there isn't). 1078 */ 1079 size_t write_queue_size = 1080 uv_stream_get_write_queue_size(&sock->uv_handle.stream); 1081 if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) { 1082 isc__nmsocket_log( 1083 sock, ISC_LOG_DEBUG(3), 1084 "resuming TCP connection, the other side " 1085 "is reading the data again (%zu)", 1086 write_queue_size); 1087 isc__nm_start_reading(sock); 1088 sock->reading_throttled = false; 1089 } 1090 } 1091 } 1092 1093 static void 1094 tcp_send_cb(uv_write_t *req, int status) { 1095 isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data; 1096 isc_nmsocket_t *sock = NULL; 1097 1098 REQUIRE(VALID_UVREQ(uvreq)); 1099 REQUIRE(VALID_NMSOCK(uvreq->sock)); 1100 1101 sock = uvreq->sock; 1102 1103 isc_nm_timer_stop(uvreq->timer); 1104 isc_nm_timer_detach(&uvreq->timer); 1105 1106 if (status < 0) { 1107 isc__nm_incstats(sock, STATID_SENDFAIL); 1108 isc__nm_failed_send_cb(sock, uvreq, isc_uverr2result(status), 1109 false); 1110 if (!sock->client && sock->reading) { 1111 /* 1112 * As we are resuming reading, it is not throttled 1113 * anymore (technically). 1114 */ 1115 sock->reading_throttled = false; 1116 isc__nm_start_reading(sock); 1117 isc__nmsocket_reset(sock); 1118 } 1119 return; 1120 } 1121 1122 isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false); 1123 tcp_maybe_restart_reading(sock); 1124 } 1125 1126 static isc_result_t 1127 tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { 1128 REQUIRE(VALID_NMSOCK(sock)); 1129 REQUIRE(VALID_UVREQ(req)); 1130 REQUIRE(sock->tid == isc_tid()); 1131 REQUIRE(sock->type == isc_nm_tcpsocket); 1132 1133 int r; 1134 uv_buf_t bufs[2] = { { 0 }, { 0 } }; /* ugly, but required for old GCC 1135 versions */ 1136 size_t nbufs = 1; 1137 1138 if (isc__nmsocket_closing(sock)) { 1139 return ISC_R_CANCELED; 1140 } 1141 1142 /* Check if we are not trying to send a DNS message */ 1143 if (*(uint16_t *)req->tcplen == 0) { 1144 bufs[0].base = req->uvbuf.base; 1145 bufs[0].len = req->uvbuf.len; 1146 1147 r = uv_try_write(&sock->uv_handle.stream, bufs, nbufs); 1148 1149 if (r == (int)(bufs[0].len)) { 1150 /* Wrote everything */ 1151 isc__nm_sendcb(sock, req, ISC_R_SUCCESS, true); 1152 tcp_maybe_restart_reading(sock); 1153 return ISC_R_SUCCESS; 1154 } else if (r > 0) { 1155 bufs[0].base += (size_t)r; 1156 bufs[0].len -= (size_t)r; 1157 } else if (!(r == UV_ENOSYS || r == UV_EAGAIN)) { 1158 return isc_uverr2result(r); 1159 } 1160 } else { 1161 nbufs = 2; 1162 bufs[0].base = req->tcplen; 1163 bufs[0].len = 2; 1164 bufs[1].base = req->uvbuf.base; 1165 bufs[1].len = req->uvbuf.len; 1166 1167 r = uv_try_write(&sock->uv_handle.stream, bufs, nbufs); 1168 1169 if (r == (int)(bufs[0].len + bufs[1].len)) { 1170 /* Wrote everything */ 1171 isc__nm_sendcb(sock, req, ISC_R_SUCCESS, true); 1172 tcp_maybe_restart_reading(sock); 1173 return ISC_R_SUCCESS; 1174 } else if (r == 1) { 1175 /* Partial write of DNSMSG length */ 1176 bufs[0].base = req->tcplen + 1; 1177 bufs[0].len = 1; 1178 } else if (r > 0) { 1179 /* Partial write of DNSMSG */ 1180 nbufs = 1; 1181 bufs[0].base = req->uvbuf.base + (r - 2); 1182 bufs[0].len = req->uvbuf.len - (r - 2); 1183 } else if (!(r == UV_ENOSYS || r == UV_EAGAIN)) { 1184 return isc_uverr2result(r); 1185 } 1186 } 1187 1188 if (!sock->client && sock->reading) { 1189 sock->reading_throttled = true; 1190 isc__nm_stop_reading(sock); 1191 } 1192 isc__nmsocket_log(sock, ISC_LOG_DEBUG(3), 1193 "%sthe other side is not " 1194 "reading the data, switching to uv_write()", 1195 !sock->client && sock->reading 1196 ? "throttling TCP connection, " 1197 : ""); 1198 1199 r = uv_write(&req->uv_req.write, &sock->uv_handle.stream, bufs, nbufs, 1200 tcp_send_cb); 1201 if (r < 0) { 1202 return isc_uverr2result(r); 1203 } 1204 1205 isc_nm_timer_create(req->handle, isc__nmsocket_writetimeout_cb, req, 1206 &req->timer); 1207 if (sock->write_timeout > 0) { 1208 isc_nm_timer_start(req->timer, sock->write_timeout); 1209 } 1210 1211 return ISC_R_SUCCESS; 1212 } 1213 1214 static void 1215 tcp_close_sock(isc_nmsocket_t *sock) { 1216 REQUIRE(VALID_NMSOCK(sock)); 1217 REQUIRE(sock->tid == isc_tid()); 1218 REQUIRE(sock->closing); 1219 REQUIRE(!sock->closed); 1220 1221 sock->closed = true; 1222 sock->connected = false; 1223 1224 isc__nm_incstats(sock, STATID_CLOSE); 1225 1226 if (sock->server != NULL) { 1227 if (sock->server->pquota != NULL) { 1228 isc__nm_decstats(sock, STATID_CLIENTS); 1229 isc_quota_release(sock->server->pquota); 1230 } 1231 isc__nmsocket_detach(&sock->server); 1232 } 1233 1234 isc__nmsocket_prep_destroy(sock); 1235 } 1236 1237 static void 1238 tcp_close_cb(uv_handle_t *handle) { 1239 isc_nmsocket_t *sock = uv_handle_get_data(handle); 1240 uv_handle_set_data(handle, NULL); 1241 1242 tcp_close_sock(sock); 1243 } 1244 1245 void 1246 isc__nm_tcp_close(isc_nmsocket_t *sock) { 1247 REQUIRE(VALID_NMSOCK(sock)); 1248 REQUIRE(sock->type == isc_nm_tcpsocket); 1249 REQUIRE(!isc__nmsocket_active(sock)); 1250 REQUIRE(sock->tid == isc_tid()); 1251 REQUIRE(sock->parent == NULL); 1252 REQUIRE(!sock->closing); 1253 1254 sock->closing = true; 1255 1256 /* 1257 * The order of the close operation is important here, the uv_close() 1258 * gets scheduled in the reverse order, so we need to close the timer 1259 * last, so its gone by the time we destroy the socket 1260 */ 1261 1262 if (!uv_is_closing(&sock->uv_handle.handle)) { 1263 /* Normal order of operation */ 1264 1265 /* 2. close the socket + destroy the socket in callback */ 1266 isc__nmsocket_clearcb(sock); 1267 isc__nm_stop_reading(sock); 1268 sock->reading = false; 1269 uv_close(&sock->uv_handle.handle, tcp_close_cb); 1270 1271 /* 1. close the timer */ 1272 isc__nmsocket_timer_stop(sock); 1273 uv_close((uv_handle_t *)&sock->read_timer, NULL); 1274 } else { 1275 /* The socket was already closed elsewhere */ 1276 1277 /* 1. close the timer + destroy the socket in callback */ 1278 isc__nmsocket_timer_stop(sock); 1279 uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); 1280 uv_close((uv_handle_t *)&sock->read_timer, tcp_close_cb); 1281 } 1282 } 1283 1284 static void 1285 tcp_close_connect_cb(uv_handle_t *handle) { 1286 isc_nmsocket_t *sock = uv_handle_get_data(handle); 1287 1288 REQUIRE(VALID_NMSOCK(sock)); 1289 1290 REQUIRE(sock->tid == isc_tid()); 1291 1292 isc__nmsocket_prep_destroy(sock); 1293 isc__nmsocket_detach(&sock); 1294 } 1295 1296 void 1297 isc__nm_tcp_shutdown(isc_nmsocket_t *sock) { 1298 REQUIRE(VALID_NMSOCK(sock)); 1299 REQUIRE(sock->tid == isc_tid()); 1300 REQUIRE(sock->type == isc_nm_tcpsocket); 1301 1302 /* 1303 * If the socket is active, mark it inactive and 1304 * continue. If it isn't active, stop now. 1305 */ 1306 if (!sock->active) { 1307 return; 1308 } 1309 sock->active = false; 1310 1311 INSIST(!sock->accepting); 1312 1313 if (sock->connecting) { 1314 isc_nmsocket_t *tsock = NULL; 1315 isc__nmsocket_attach(sock, &tsock); 1316 uv_close(&sock->uv_handle.handle, tcp_close_connect_cb); 1317 return; 1318 } 1319 1320 /* There's a handle attached to the socket (from accept or connect) */ 1321 if (sock->statichandle) { 1322 isc__nm_failed_read_cb(sock, ISC_R_SHUTTINGDOWN, false); 1323 return; 1324 } 1325 1326 /* Destroy the non-listening socket */ 1327 if (sock->parent == NULL) { 1328 isc__nmsocket_prep_destroy(sock); 1329 return; 1330 } 1331 1332 /* Destroy the listening socket if on the same loop */ 1333 if (sock->tid == sock->parent->tid) { 1334 isc__nmsocket_prep_destroy(sock->parent); 1335 } 1336 } 1337 1338 void 1339 isc__nmhandle_tcp_set_manual_timer(isc_nmhandle_t *handle, const bool manual) { 1340 isc_nmsocket_t *sock; 1341 1342 REQUIRE(VALID_NMHANDLE(handle)); 1343 sock = handle->sock; 1344 REQUIRE(VALID_NMSOCK(sock)); 1345 REQUIRE(sock->type == isc_nm_tcpsocket); 1346 REQUIRE(sock->tid == isc_tid()); 1347 REQUIRE(!uv_is_active(&sock->uv_handle.handle)); 1348 1349 sock->manual_read_timer = manual; 1350 } 1351