/* $NetBSD: tcp.c,v 1.13 2025/05/21 14:48:05 christos Exp $ */

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

#include <libgen.h>
#include <unistd.h>

#include <isc/async.h>
#include <isc/atomic.h>
#include <isc/barrier.h>
#include <isc/buffer.h>
#include <isc/condition.h>
#include <isc/errno.h>
#include <isc/log.h>
#include <isc/magic.h>
#include <isc/mem.h>
#include <isc/netmgr.h>
#include <isc/quota.h>
#include <isc/random.h>
#include <isc/refcount.h>
#include <isc/region.h>
#include <isc/result.h>
#include <isc/sockaddr.h>
#include <isc/stdtime.h>
#include <isc/thread.h>
#include <isc/util.h>
#include <isc/uv.h>

#include "../loop_p.h"
#include "netmgr-int.h"

/*
 * Wall-clock second of the most recent TCP-quota log message; used to
 * rate-limit quota logging to roughly one message per second.
 */
static atomic_uint_fast32_t last_tcpquota_log = 0;

/*
 * Return true at most once per wall-clock second (shared across threads):
 * atomically swap the stored timestamp with "now" and allow logging only
 * when the second actually changed.
 */
static bool
can_log_tcp_quota(void) {
	isc_stdtime_t last;
	isc_stdtime_t now = isc_stdtime_now();
	last = atomic_exchange_relaxed(&last_tcpquota_log, now);
	if (now != last) {
		return true;
	}

	return false;
}

static isc_result_t
tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req);

static isc_result_t
tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req);
static void
tcp_connect_cb(uv_connect_t *uvreq, int status);
static void
tcp_stop_cb(uv_handle_t *handle);

static void
tcp_connection_cb(uv_stream_t *server, int status);

static void
tcp_close_cb(uv_handle_t *uvhandle);

static isc_result_t
accept_connection(isc_nmsocket_t *ssock);

static void
quota_accept_cb(void *arg);

/*
 * Initialize the libuv TCP handle and read timer for an outgoing
 * connection and start the (asynchronous) uv_tcp_connect(); completion is
 * reported via tcp_connect_cb().  Must run on the socket's own loop
 * thread.  On failure the uv handles have already been initialized, so
 * the caller can still close the socket normally.
 */
static isc_result_t
tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
	isc__networker_t *worker = NULL;
	isc_result_t result = ISC_R_UNSET;
	int r;

	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(VALID_UVREQ(req));

	REQUIRE(sock->tid == isc_tid());

	worker = sock->worker;

	sock->connecting = true;

	/* 2 minute timeout */
	result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000);
	RUNTIME_CHECK(result == ISC_R_SUCCESS);

	r = uv_tcp_init(&worker->loop->loop, &sock->uv_handle.tcp);
	UV_RUNTIME_CHECK(uv_tcp_init, r);
	uv_handle_set_data(&sock->uv_handle.handle, sock);

	r = uv_timer_init(&worker->loop->loop, &sock->read_timer);
	UV_RUNTIME_CHECK(uv_timer_init, r);
	uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock);

	/* Hand the already-created fd over to libuv. */
	r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd);
	if (r != 0) {
		isc__nm_closesocket(sock->fd);
		isc__nm_incstats(sock, STATID_OPENFAIL);
		return isc_uverr2result(r);
	}
	isc__nm_incstats(sock, STATID_OPEN);

	/* Bind to the caller-supplied local address, if any. */
	if (req->local.length != 0) {
		r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0);
		if (r != 0) {
			isc__nm_incstats(sock, STATID_BINDFAIL);
			return isc_uverr2result(r);
		}
	}

	isc__nm_set_network_buffers(sock->worker->netmgr,
				    &sock->uv_handle.handle);

	uv_handle_set_data(&req->uv_req.handle, req);
	r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp,
			   &req->peer.type.sa, tcp_connect_cb);
	if (r != 0) {
		isc__nm_incstats(sock, STATID_CONNECTFAIL);
		return isc_uverr2result(r);
	}

	/*
	 * While connecting, the read timer's data points at the connect
	 * request; tcp_connect_cb() restores it to the socket on success.
	 */
	uv_handle_set_data((uv_handle_t *)&sock->read_timer,
			   &req->uv_req.connect);
	isc__nmsocket_timer_start(sock);

	return ISC_R_SUCCESS;
}

/*
 * uv_tcp_connect() completion callback: translate libuv/shutdown/timeout
 * conditions into isc_result codes, retry on FreeBSD's spurious
 * EADDRINUSE, record the peer address, and invoke the user's connect
 * callback (or the failure path).
 */
static void
tcp_connect_cb(uv_connect_t *uvreq, int status) {
	isc_result_t result = ISC_R_UNSET;
	isc__nm_uvreq_t *req = NULL;
	isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle);
	struct sockaddr_storage ss;
	isc__networker_t *worker = NULL;
	int r;

	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(sock->tid == isc_tid());

	worker = sock->worker;

	req = uv_handle_get_data((uv_handle_t *)uvreq);

	REQUIRE(VALID_UVREQ(req));
	REQUIRE(VALID_NMHANDLE(req->handle));

	INSIST(sock->connecting);

	if (sock->timedout || status == UV_ETIMEDOUT) {
		/* Connection timed-out */
		result = ISC_R_TIMEDOUT;
		goto error;
	} else if (isc__nm_closing(worker)) {
		/* Network manager shutting down */
		result = ISC_R_SHUTTINGDOWN;
		goto error;
	} else if (isc__nmsocket_closing(sock)) {
		/* Connection canceled */
		result = ISC_R_CANCELED;
		goto error;
	} else if (status == UV_EADDRINUSE) {
		/*
		 * On FreeBSD the TCP connect() call sometimes results in a
		 * spurious transient EADDRINUSE. Try a few more times before
		 * giving up.
		 */
		if (--req->connect_tries > 0) {
			r = uv_tcp_connect(&req->uv_req.connect,
					   &sock->uv_handle.tcp,
					   &req->peer.type.sa, tcp_connect_cb);
			if (r != 0) {
				result = isc_uverr2result(r);
				goto error;
			}
			return;
		}
		result = isc_uverr2result(status);
		goto error;
	} else if (status != 0) {
		result = isc_uverr2result(status);
		goto error;
	}

	/* Connected: stop the connect timer and re-point it at the socket. */
	isc__nmsocket_timer_stop(sock);
	uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock);

	isc__nm_incstats(sock, STATID_CONNECT);
	r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss,
			       &(int){ sizeof(ss) });
	if (r != 0) {
		result = isc_uverr2result(r);
		goto error;
	}

	sock->connecting = false;
	sock->connected = true;

	result = isc_sockaddr_fromsockaddr(&sock->peer, (struct sockaddr *)&ss);
	RUNTIME_CHECK(result == ISC_R_SUCCESS);

	isc__nm_connectcb(sock, req, ISC_R_SUCCESS, false);

	return;
error:
	isc__nm_failed_connect_cb(sock, req, result, false);
}

/*
 * Public API: create a client TCP socket on the current loop thread and
 * start connecting 'local' -> 'peer'.  The connect_cb is guaranteed to be
 * called exactly once, with the resulting handle or an error.  'timeout'
 * is the connect timeout in milliseconds.
 */
void
isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer,
		  isc_nm_cb_t connect_cb, void *connect_cbarg,
		  unsigned int timeout) {
	isc_result_t result = ISC_R_SUCCESS;
	isc_nmsocket_t *sock = NULL;
	isc__nm_uvreq_t *req = NULL;
	sa_family_t sa_family;
	isc__networker_t *worker = NULL;
	uv_os_sock_t fd = -1;

	REQUIRE(VALID_NM(mgr));
	REQUIRE(local != NULL);
	REQUIRE(peer != NULL);

	worker = &mgr->workers[isc_tid()];

	if (isc__nm_closing(worker)) {
		connect_cb(NULL, ISC_R_SHUTTINGDOWN, connect_cbarg);
		return;
	}

	sa_family = peer->type.sa.sa_family;

	result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &fd);
	if (result != ISC_R_SUCCESS) {
		connect_cb(NULL, result, connect_cbarg);
		return;
	}

	sock = isc_mempool_get(worker->nmsocket_pool);
	isc__nmsocket_init(sock, worker, isc_nm_tcpsocket, local, NULL);

	sock->connect_timeout = timeout;
	sock->fd = fd;
	sock->client = true;

	req = isc__nm_uvreq_get(sock);
	req->cb.connect = connect_cb;
	req->cbarg = connect_cbarg;
	req->peer = *peer;
	req->local = *local;
	req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface);

	/* Best-effort socket options; failures are ignored. */
	(void)isc__nm_socket_min_mtu(sock->fd, sa_family);
	(void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);

	sock->active = true;

	result = tcp_connect_direct(sock, req);
	if (result != ISC_R_SUCCESS) {
		sock->active = false;
		isc__nm_tcp_close(sock);
		isc__nm_connectcb(sock, req, result, true);
	}

	/*
	 * The sock is now attached to the handle.
	 */
	isc__nmsocket_detach(&sock);
}

/*
 * Create a listening-side OS socket with SO_REUSEADDR (and the
 * load-balancing reuse option when enabled).  Aborts on failure.
 */
static uv_os_sock_t
isc__nm_tcp_lb_socket(isc_nm_t *mgr, sa_family_t sa_family) {
	isc_result_t result;
	uv_os_sock_t sock;

	result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock);
	RUNTIME_CHECK(result == ISC_R_SUCCESS);

	(void)isc__nm_socket_v6only(sock, sa_family);

	/* FIXME: set mss */

	result = isc__nm_socket_reuse(sock, 1);
	RUNTIME_CHECK(result == ISC_R_SUCCESS);

	if (mgr->load_balance_sockets) {
		result = isc__nm_socket_reuse_lb(sock);
		RUNTIME_CHECK(result == ISC_R_SUCCESS);
	}

	return sock;
}

/*
 * Per-worker job that binds and starts listening on one child socket.
 * Runs on the child's own loop thread.  The outcome is stored in
 * sock->result; non-primary workers (tid != 0) then wait on the parent's
 * listen_barrier so isc_nm_listentcp() can collect all results.
 */
static void
start_tcp_child_job(void *arg) {
	isc_nmsocket_t *sock = arg;

	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(VALID_NMSOCK(sock->parent));
	REQUIRE(sock->type == isc_nm_tcpsocket);
	REQUIRE(sock->tid == isc_tid());

	sa_family_t sa_family = sock->iface.type.sa.sa_family;
	int r, flags = 0;
	isc_result_t result = ISC_R_UNSET;
	isc_loop_t *loop = sock->worker->loop;
	struct sockaddr_storage ss;

	(void)isc__nm_socket_min_mtu(sock->fd, sa_family);
	(void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);

	r = uv_tcp_init(&loop->loop, &sock->uv_handle.tcp);
	UV_RUNTIME_CHECK(uv_tcp_init, r);
	uv_handle_set_data(&sock->uv_handle.handle, sock);
	/* This keeps the socket alive after everything else is gone */
	isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL });

	r = uv_timer_init(&loop->loop, &sock->read_timer);
	UV_RUNTIME_CHECK(uv_timer_init, r);
	uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock);

	r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd);
	if (r < 0) {
		isc__nm_closesocket(sock->fd);
		isc__nm_incstats(sock, STATID_OPENFAIL);
		goto done;
	}
	isc__nm_incstats(sock, STATID_OPEN);

	if (sa_family == AF_INET6) {
		flags = UV_TCP_IPV6ONLY;
	}

	if (sock->worker->netmgr->load_balance_sockets) {
		/* Each worker has its own fd; every child binds. */
		r = isc__nm_tcp_freebind(&sock->uv_handle.tcp,
					 &sock->iface.type.sa, flags);
		if (r < 0) {
			isc__nm_incstats(sock, STATID_BINDFAIL);
			goto done;
		}
	} else if (sock->tid == 0) {
		/* Shared fd: only the first child binds... */
		r = isc__nm_tcp_freebind(&sock->uv_handle.tcp,
					 &sock->iface.type.sa, flags);
		if (r < 0) {
			isc__nm_incstats(sock, STATID_BINDFAIL);
			goto done;
		}
		sock->parent->uv_handle.tcp.flags = sock->uv_handle.tcp.flags;
	} else {
		/* The socket is already bound, just copy the flags */
		sock->uv_handle.tcp.flags = sock->parent->uv_handle.tcp.flags;
	}

	isc__nm_set_network_buffers(sock->worker->netmgr,
				    &sock->uv_handle.handle);

	/*
	 * The callback will run in the same thread uv_listen() was called
	 * from, so a race with tcp_connection_cb() isn't possible.
	 */
	r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog,
		      tcp_connection_cb);
	if (r != 0) {
		isc__nmsocket_log(sock, ISC_LOG_ERROR, "uv_listen failed: %s",
				  isc_result_totext(isc_uverr2result(r)));
		isc__nm_incstats(sock, STATID_BINDFAIL);
		goto done;
	}

	if (sock->tid == 0) {
		/* Publish the actually-bound address (e.g. port 0). */
		r = uv_tcp_getsockname(&sock->uv_handle.tcp,
				       (struct sockaddr *)&ss,
				       &(int){ sizeof(ss) });
		if (r != 0) {
			goto done;
		}

		result = isc_sockaddr_fromsockaddr(&sock->parent->iface,
						   (struct sockaddr *)&ss);
		if (result != ISC_R_SUCCESS) {
			goto done_result;
		}
	}

done:
	result = isc_uverr2result(r);

done_result:
	if (result != ISC_R_SUCCESS) {
		sock->pquota = NULL;
	}

	sock->result = result;

	REQUIRE(!loop->paused);

	if (sock->tid != 0) {
		isc_barrier_wait(&sock->parent->listen_barrier);
	}
}

/*
 * Initialize the tid-th child of a listener and dispatch
 * start_tcp_child_job() on that worker's loop (run synchronously for
 * tid 0, which is the calling thread).
 */
static void
start_tcp_child(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nmsocket_t *sock,
		uv_os_sock_t fd, int tid) {
	isc_nmsocket_t *csock = &sock->children[tid];
	isc__networker_t *worker = &mgr->workers[tid];

	isc__nmsocket_init(csock, worker, isc_nm_tcpsocket, iface, sock);
	csock->accept_cb = sock->accept_cb;
	csock->accept_cbarg = sock->accept_cbarg;
	csock->backlog = sock->backlog;

	/*
	 * Quota isn't attached, just assigned.
	 */
	csock->pquota = sock->pquota;

	if (mgr->load_balance_sockets) {
		UNUSED(fd);
		csock->fd = isc__nm_tcp_lb_socket(mgr,
						  iface->type.sa.sa_family);
	} else {
		csock->fd = dup(fd);
	}
	REQUIRE(csock->fd >= 0);

	if (tid == 0) {
		start_tcp_child_job(csock);
	} else {
		isc_async_run(worker->loop, start_tcp_child_job, csock);
	}
}

/*
 * Public API: start listening on 'iface' with one child socket per worker
 * ('workers' of them, or all loops for ISC_NM_LISTEN_ALL / 0).  Waits for
 * every child to report; if any child failed, the whole listener is torn
 * down and the first failure is returned.  Must be called on thread 0.
 */
isc_result_t
isc_nm_listentcp(isc_nm_t *mgr, uint32_t workers, isc_sockaddr_t *iface,
		 isc_nm_accept_cb_t accept_cb, void *accept_cbarg, int backlog,
		 isc_quota_t *quota, isc_nmsocket_t **sockp) {
	isc_nmsocket_t *sock = NULL;
	uv_os_sock_t fd = -1;
	isc_result_t result = ISC_R_UNSET;
	isc__networker_t *worker = NULL;

	REQUIRE(VALID_NM(mgr));
	REQUIRE(isc_tid() == 0);

	if (workers == 0) {
		workers = mgr->nloops;
	}
	REQUIRE(workers <= mgr->nloops);

	worker = &mgr->workers[0];
	sock = isc_mempool_get(worker->nmsocket_pool);
	isc__nmsocket_init(sock, worker, isc_nm_tcplistener, iface, NULL);

	sock->nchildren = (workers == ISC_NM_LISTEN_ALL) ? (uint32_t)mgr->nloops
							 : workers;
	sock->children = isc_mem_cget(worker->mctx, sock->nchildren,
				      sizeof(sock->children[0]));

	isc__nmsocket_barrier_init(sock);

	sock->accept_cb = accept_cb;
	sock->accept_cbarg = accept_cbarg;
	sock->backlog = backlog;
	sock->pquota = quota;

	if (!mgr->load_balance_sockets) {
		/* One shared fd, dup()ed into each child. */
		fd = isc__nm_tcp_lb_socket(mgr, iface->type.sa.sa_family);
	}

	/* Child 0 runs synchronously so its result is available now. */
	start_tcp_child(mgr, iface, sock, fd, 0);
	result = sock->children[0].result;
	INSIST(result != ISC_R_UNSET);

	for (size_t i = 1; i < sock->nchildren; i++) {
		start_tcp_child(mgr, iface, sock, fd, i);
	}

	isc_barrier_wait(&sock->listen_barrier);

	if (!mgr->load_balance_sockets) {
		isc__nm_closesocket(fd);
	}

	/*
	 * If any of the child sockets have failed then isc_nm_listentcp
	 * fails.
	 */
	for (size_t i = 1; i < sock->nchildren; i++) {
		if (result == ISC_R_SUCCESS &&
		    sock->children[i].result != ISC_R_SUCCESS)
		{
			result = sock->children[i].result;
		}
	}

	if (result != ISC_R_SUCCESS) {
		sock->active = false;
		isc__nm_tcp_stoplistening(sock);
		isc_nmsocket_close(&sock);

		return result;
	}

	sock->active = true;

	*sockp = sock;
	return ISC_R_SUCCESS;
}

/*
 * uv_listen() callback on a child listener: accept the pending connection
 * (subject to the TCP clients quota) into a fresh child socket.
 */
static void
tcp_connection_cb(uv_stream_t *server, int status) {
	isc_nmsocket_t *ssock = uv_handle_get_data((uv_handle_t *)server);
	isc_result_t result;

	REQUIRE(ssock->accept_cb != NULL);

	if (status != 0) {
		result = isc_uverr2result(status);
		goto done;
	}

	REQUIRE(VALID_NMSOCK(ssock));
	REQUIRE(ssock->tid == isc_tid());

	if (isc__nmsocket_closing(ssock)) {
		result = ISC_R_CANCELED;
		goto done;
	}

	/* Prepare the child socket */
	isc_nmsocket_t *csock = isc_mempool_get(ssock->worker->nmsocket_pool);
	isc__nmsocket_init(csock, ssock->worker, isc_nm_tcpsocket,
			   &ssock->iface, NULL);
isc__nmsocket_attach(ssock, &csock->server); 547 548 if (csock->server->pquota != NULL) { 549 result = isc_quota_acquire_cb(csock->server->pquota, 550 &csock->quotacb, quota_accept_cb, 551 csock); 552 if (result == ISC_R_QUOTA) { 553 csock->quota_accept_ts = isc_time_monotonic(); 554 isc__nm_incstats(ssock, STATID_ACCEPTFAIL); 555 goto done; 556 } 557 } 558 559 result = accept_connection(csock); 560 done: 561 isc__nm_accept_connection_log(ssock, result, can_log_tcp_quota()); 562 } 563 564 static void 565 stop_tcp_child_job(void *arg) { 566 isc_nmsocket_t *sock = arg; 567 568 REQUIRE(VALID_NMSOCK(sock)); 569 REQUIRE(sock->tid == isc_tid()); 570 REQUIRE(sock->parent != NULL); 571 REQUIRE(sock->type == isc_nm_tcpsocket); 572 REQUIRE(!sock->closing); 573 574 sock->active = false; 575 sock->closing = true; 576 577 /* 578 * The order of the close operation is important here, the uv_close() 579 * gets scheduled in the reverse order, so we need to close the timer 580 * last, so its gone by the time we destroy the socket 581 */ 582 583 /* 2. close the listening socket */ 584 isc__nmsocket_clearcb(sock); 585 isc__nm_stop_reading(sock); 586 uv_close(&sock->uv_handle.handle, tcp_stop_cb); 587 588 /* 1. 
close the read timer */ 589 isc__nmsocket_timer_stop(sock); 590 uv_close(&sock->read_timer, NULL); 591 592 REQUIRE(!sock->worker->loop->paused); 593 isc_barrier_wait(&sock->parent->stop_barrier); 594 } 595 596 static void 597 stop_tcp_child(isc_nmsocket_t *sock) { 598 REQUIRE(VALID_NMSOCK(sock)); 599 600 if (sock->tid == 0) { 601 stop_tcp_child_job(sock); 602 } else { 603 isc_async_run(sock->worker->loop, stop_tcp_child_job, sock); 604 } 605 } 606 607 void 608 isc__nm_tcp_stoplistening(isc_nmsocket_t *sock) { 609 REQUIRE(VALID_NMSOCK(sock)); 610 REQUIRE(sock->type == isc_nm_tcplistener); 611 REQUIRE(sock->tid == isc_tid()); 612 REQUIRE(sock->tid == 0); 613 REQUIRE(!sock->closing); 614 615 sock->closing = true; 616 617 /* Mark the parent socket inactive */ 618 sock->active = false; 619 620 /* Stop all the other threads' children */ 621 for (size_t i = 1; i < sock->nchildren; i++) { 622 stop_tcp_child(&sock->children[i]); 623 } 624 625 /* Stop the child for the main thread */ 626 stop_tcp_child(&sock->children[0]); 627 628 /* Stop the parent */ 629 sock->closed = true; 630 631 isc__nmsocket_prep_destroy(sock); 632 } 633 634 static void 635 tcp_stop_cb(uv_handle_t *handle) { 636 isc_nmsocket_t *sock = uv_handle_get_data(handle); 637 uv_handle_set_data(handle, NULL); 638 639 REQUIRE(VALID_NMSOCK(sock)); 640 REQUIRE(sock->tid == isc_tid()); 641 REQUIRE(sock->closing); 642 REQUIRE(sock->type == isc_nm_tcpsocket); 643 REQUIRE(!sock->closed); 644 645 sock->closed = true; 646 647 isc__nm_incstats(sock, STATID_CLOSE); 648 649 isc__nmsocket_detach(&sock); 650 } 651 652 void 653 isc__nm_tcp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, 654 bool async) { 655 REQUIRE(VALID_NMSOCK(sock)); 656 REQUIRE(result != ISC_R_SUCCESS); 657 658 isc__nmsocket_timer_stop(sock); 659 isc__nm_stop_reading(sock); 660 sock->reading = false; 661 662 if (sock->recv_cb != NULL) { 663 isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); 664 isc__nmsocket_clearcb(sock); 665 
isc__nm_readcb(sock, req, result, async); 666 } 667 668 isc__nmsocket_prep_destroy(sock); 669 } 670 671 void 672 isc__nm_tcp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { 673 isc_nmsocket_t *sock; 674 isc_nm_t *netmgr; 675 isc_result_t result; 676 677 REQUIRE(VALID_NMHANDLE(handle)); 678 REQUIRE(VALID_NMSOCK(handle->sock)); 679 680 sock = handle->sock; 681 netmgr = sock->worker->netmgr; 682 683 REQUIRE(sock->type == isc_nm_tcpsocket); 684 REQUIRE(sock->statichandle == handle); 685 686 sock->recv_cb = cb; 687 sock->recv_cbarg = cbarg; 688 689 /* Initialize the timer */ 690 if (sock->read_timeout == 0) { 691 sock->read_timeout = 692 sock->keepalive 693 ? atomic_load_relaxed(&netmgr->keepalive) 694 : atomic_load_relaxed(&netmgr->idle); 695 } 696 697 if (isc__nmsocket_closing(sock)) { 698 result = ISC_R_CANCELED; 699 goto failure; 700 } 701 702 if (!sock->reading_throttled) { 703 result = isc__nm_start_reading(sock); 704 if (result != ISC_R_SUCCESS) { 705 goto failure; 706 } 707 } 708 709 sock->reading = true; 710 711 if (!sock->manual_read_timer) { 712 isc__nmsocket_timer_start(sock); 713 } 714 715 return; 716 failure: 717 isc__nm_tcp_failed_read_cb(sock, result, true); 718 } 719 720 void 721 isc__nm_tcp_read_stop(isc_nmhandle_t *handle) { 722 REQUIRE(VALID_NMHANDLE(handle)); 723 REQUIRE(VALID_NMSOCK(handle->sock)); 724 725 isc_nmsocket_t *sock = handle->sock; 726 727 if (!sock->manual_read_timer) { 728 isc__nmsocket_timer_stop(sock); 729 } 730 isc__nm_stop_reading(sock); 731 sock->reading = false; 732 733 return; 734 } 735 736 void 737 isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { 738 isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)stream); 739 isc__nm_uvreq_t *req = NULL; 740 isc_nm_t *netmgr = NULL; 741 742 REQUIRE(VALID_NMSOCK(sock)); 743 REQUIRE(sock->tid == isc_tid()); 744 REQUIRE(buf != NULL); 745 746 netmgr = sock->worker->netmgr; 747 748 if (isc__nmsocket_closing(sock)) { 749 
isc__nm_tcp_failed_read_cb(sock, ISC_R_CANCELED, false); 750 goto free; 751 } 752 753 if (nread < 0) { 754 if (nread != UV_EOF) { 755 isc__nm_incstats(sock, STATID_RECVFAIL); 756 } 757 758 isc__nm_tcp_failed_read_cb(sock, isc_uverr2result(nread), 759 false); 760 761 goto free; 762 } 763 764 req = isc__nm_get_read_req(sock, NULL); 765 766 /* 767 * The callback will be called synchronously because the 768 * result is ISC_R_SUCCESS, so we don't need to retain 769 * the buffer 770 */ 771 req->uvbuf.base = buf->base; 772 req->uvbuf.len = nread; 773 774 if (!sock->client) { 775 sock->read_timeout = 776 sock->keepalive 777 ? atomic_load_relaxed(&netmgr->keepalive) 778 : atomic_load_relaxed(&netmgr->idle); 779 } 780 781 isc__nm_readcb(sock, req, ISC_R_SUCCESS, false); 782 783 if (!sock->client && sock->reading) { 784 /* 785 * Stop reading if we have accumulated enough bytes in the send 786 * queue; this means that the TCP client is not reading back the 787 * data we sending to it, and there's no reason to continue 788 * processing more incoming DNS messages, if the client is not 789 * reading back the responses. 790 */ 791 size_t write_queue_size = 792 uv_stream_get_write_queue_size(&sock->uv_handle.stream); 793 794 if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) { 795 isc__nmsocket_log( 796 sock, ISC_LOG_DEBUG(3), 797 "throttling TCP connection, the other side is " 798 "not reading the data (%zu)", 799 write_queue_size); 800 sock->reading_throttled = true; 801 isc__nm_stop_reading(sock); 802 } 803 } else if (uv_is_active(&sock->uv_handle.handle) && 804 !sock->manual_read_timer) 805 { 806 /* The readcb could have paused the reading */ 807 /* The timer will be updated */ 808 isc__nmsocket_timer_restart(sock); 809 } 810 811 free: 812 if (nread < 0) { 813 /* 814 * The buffer may be a null buffer on error. 
815 */ 816 if (buf->base == NULL && buf->len == 0) { 817 return; 818 } 819 } 820 821 isc__nm_free_uvbuf(sock, buf); 822 } 823 824 /* 825 * This is called after we get a quota_accept_cb() callback. 826 */ 827 static void 828 tcpaccept_cb(void *arg) { 829 isc_nmsocket_t *csock = arg; 830 isc_nmsocket_t *ssock = csock->server; 831 832 REQUIRE(VALID_NMSOCK(csock)); 833 REQUIRE(csock->tid == isc_tid()); 834 835 isc_result_t result = accept_connection(csock); 836 isc__nm_accept_connection_log(ssock, result, can_log_tcp_quota()); 837 isc__nmsocket_detach(&csock); 838 } 839 840 static void 841 quota_accept_cb(void *arg) { 842 isc_nmsocket_t *csock = arg; 843 isc_nmsocket_t *ssock = csock->server; 844 845 REQUIRE(VALID_NMSOCK(csock)); 846 847 /* 848 * This needs to be asynchronous, because the quota might have been 849 * released by a different child socket. 850 */ 851 if (csock->tid == isc_tid()) { 852 isc_result_t result = accept_connection(csock); 853 isc__nm_accept_connection_log(ssock, result, 854 can_log_tcp_quota()); 855 } else { 856 isc__nmsocket_attach(csock, &(isc_nmsocket_t *){ NULL }); 857 isc_async_run(csock->worker->loop, tcpaccept_cb, csock); 858 } 859 } 860 861 static isc_result_t 862 accept_connection(isc_nmsocket_t *csock) { 863 int r; 864 isc_result_t result; 865 struct sockaddr_storage ss; 866 isc_sockaddr_t local; 867 isc_nmhandle_t *handle = NULL; 868 869 REQUIRE(VALID_NMSOCK(csock)); 870 REQUIRE(VALID_NMSOCK(csock->server)); 871 REQUIRE(csock->tid == isc_tid()); 872 873 csock->accepting = true; 874 csock->accept_cb = csock->server->accept_cb; 875 csock->accept_cbarg = csock->server->accept_cbarg; 876 csock->recv_cb = csock->server->recv_cb; 877 csock->recv_cbarg = csock->server->recv_cbarg; 878 csock->read_timeout = atomic_load_relaxed(&csock->worker->netmgr->init); 879 880 r = uv_tcp_init(&csock->worker->loop->loop, &csock->uv_handle.tcp); 881 UV_RUNTIME_CHECK(uv_tcp_init, r); 882 uv_handle_set_data(&csock->uv_handle.handle, csock); 883 884 r = 
uv_timer_init(&csock->worker->loop->loop, &csock->read_timer); 885 UV_RUNTIME_CHECK(uv_timer_init, r); 886 uv_handle_set_data((uv_handle_t *)&csock->read_timer, csock); 887 888 if (csock->server->pquota != NULL) { 889 isc__nm_incstats(csock, STATID_CLIENTS); 890 } 891 892 /* 893 * We need to initialize the tcp and timer before failing because 894 * isc__nm_tcp_close() can't handle uninitalized TCP nmsocket. 895 */ 896 if (isc__nmsocket_closing(csock)) { 897 result = ISC_R_CANCELED; 898 goto failure; 899 } 900 901 r = uv_accept(&csock->server->uv_handle.stream, 902 &csock->uv_handle.stream); 903 if (r != 0) { 904 result = isc_uverr2result(r); 905 goto failure; 906 } 907 908 /* Check if the connection is not expired */ 909 if (csock->quota_accept_ts != 0) { 910 /* The timestamp is given in nanoseconds */ 911 const uint64_t time_elapsed_ms = 912 (isc_time_monotonic() - csock->quota_accept_ts) / 913 NS_PER_MS; 914 915 if (time_elapsed_ms >= csock->read_timeout) { 916 /* 917 * At this point we have received a connection from a 918 * queue of accepted connections (via uv_accept()), but 919 * it has expired. We cannot do anything better than 920 * drop it on the floor at this point. 
921 */ 922 result = ISC_R_TIMEDOUT; 923 goto failure; 924 } else { 925 /* Adjust the initial read timeout accordingly */ 926 csock->read_timeout -= time_elapsed_ms; 927 } 928 } 929 930 r = uv_tcp_getpeername(&csock->uv_handle.tcp, (struct sockaddr *)&ss, 931 &(int){ sizeof(ss) }); 932 if (r != 0) { 933 result = isc_uverr2result(r); 934 goto failure; 935 } 936 937 result = isc_sockaddr_fromsockaddr(&csock->peer, 938 (struct sockaddr *)&ss); 939 if (result != ISC_R_SUCCESS) { 940 goto failure; 941 } 942 943 r = uv_tcp_getsockname(&csock->uv_handle.tcp, (struct sockaddr *)&ss, 944 &(int){ sizeof(ss) }); 945 if (r != 0) { 946 result = isc_uverr2result(r); 947 goto failure; 948 } 949 950 result = isc_sockaddr_fromsockaddr(&local, (struct sockaddr *)&ss); 951 if (result != ISC_R_SUCCESS) { 952 goto failure; 953 } 954 955 handle = isc__nmhandle_get(csock, NULL, &local); 956 957 result = csock->accept_cb(handle, ISC_R_SUCCESS, csock->accept_cbarg); 958 if (result != ISC_R_SUCCESS) { 959 isc_nmhandle_detach(&handle); 960 goto failure; 961 } 962 963 csock->accepting = false; 964 965 isc__nm_incstats(csock, STATID_ACCEPT); 966 967 /* 968 * The acceptcb needs to attach to the handle if it wants to keep the 969 * connection alive 970 */ 971 isc_nmhandle_detach(&handle); 972 973 /* 974 * sock is now attached to the handle. 
975 */ 976 isc__nmsocket_detach(&csock); 977 978 return ISC_R_SUCCESS; 979 980 failure: 981 csock->active = false; 982 csock->accepting = false; 983 984 if (result != ISC_R_NOTCONNECTED) { 985 /* IGNORE: The client disconnected before we could accept */ 986 isc__nmsocket_log(csock, ISC_LOG_ERROR, 987 "Accepting TCP connection failed: %s", 988 isc_result_totext(result)); 989 } 990 991 isc__nmsocket_prep_destroy(csock); 992 993 isc__nmsocket_detach(&csock); 994 995 return result; 996 } 997 998 static void 999 tcp_send(isc_nmhandle_t *handle, const isc_region_t *region, isc_nm_cb_t cb, 1000 void *cbarg, const bool dnsmsg) { 1001 REQUIRE(VALID_NMHANDLE(handle)); 1002 REQUIRE(VALID_NMSOCK(handle->sock)); 1003 1004 isc_nmsocket_t *sock = handle->sock; 1005 isc_result_t result; 1006 isc__nm_uvreq_t *uvreq = NULL; 1007 isc_nm_t *netmgr = sock->worker->netmgr; 1008 1009 REQUIRE(sock->type == isc_nm_tcpsocket); 1010 REQUIRE(sock->tid == isc_tid()); 1011 1012 uvreq = isc__nm_uvreq_get(sock); 1013 if (dnsmsg) { 1014 *(uint16_t *)uvreq->tcplen = htons(region->length); 1015 } 1016 uvreq->uvbuf.base = (char *)region->base; 1017 uvreq->uvbuf.len = region->length; 1018 1019 isc_nmhandle_attach(handle, &uvreq->handle); 1020 1021 uvreq->cb.send = cb; 1022 uvreq->cbarg = cbarg; 1023 1024 if (sock->write_timeout == 0) { 1025 sock->write_timeout = 1026 sock->keepalive 1027 ? 
atomic_load_relaxed(&netmgr->keepalive)
				: atomic_load_relaxed(&netmgr->idle);
	}

	result = tcp_send_direct(sock, uvreq);
	if (result != ISC_R_SUCCESS) {
		isc__nm_incstats(sock, STATID_SENDFAIL);
		isc__nm_failed_send_cb(sock, uvreq, result, true);
	}

	return;
}

/* Public API: send a raw region over TCP. */
void
isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region,
		 isc_nm_cb_t cb, void *cbarg) {
	tcp_send(handle, region, cb, cbarg, false);
}

/* Public API: send a DNS message (2-byte length prefix added) over TCP. */
void
isc__nm_tcp_senddns(isc_nmhandle_t *handle, const isc_region_t *region,
		    isc_nm_cb_t cb, void *cbarg) {
	tcp_send(handle, region, cb, cbarg, true);
}

/*
 * Resume reading on a previously-throttled server socket once the send
 * queue has drained below the buffer-size threshold.
 */
static void
tcp_maybe_restart_reading(isc_nmsocket_t *sock) {
	if (!sock->client && sock->reading &&
	    !uv_is_active(&sock->uv_handle.handle))
	{
		/*
		 * Restart reading if we have less data in the send queue than
		 * the send buffer size, this means that the TCP client has
		 * started reading some data again. Starting reading when we go
		 * under the limit instead of waiting for all data has been
		 * flushed allows faster recovery (in case there was a
		 * congestion and now there isn't).
		 */
		size_t write_queue_size =
			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
		if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
			isc__nmsocket_log(
				sock, ISC_LOG_DEBUG(3),
				"resuming TCP connection, the other side "
				"is reading the data again (%zu)",
				write_queue_size);
			isc__nm_start_reading(sock);
			sock->reading_throttled = false;
		}
	}
}

/*
 * uv_write() completion callback: stop the write timer, report the
 * outcome to the send callback, and possibly resume throttled reading.
 * On error the server-side connection is reset.
 */
static void
tcp_send_cb(uv_write_t *req, int status) {
	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
	isc_nmsocket_t *sock = NULL;

	REQUIRE(VALID_UVREQ(uvreq));
	REQUIRE(VALID_NMSOCK(uvreq->sock));

	sock = uvreq->sock;

	isc_nm_timer_stop(uvreq->timer);
	isc_nm_timer_detach(&uvreq->timer);

	if (status < 0) {
		isc__nm_incstats(sock, STATID_SENDFAIL);
		isc__nm_failed_send_cb(sock, uvreq, isc_uverr2result(status),
				       false);
		if (!sock->client && sock->reading) {
			/*
			 * As we are resuming reading, it is not throttled
			 * anymore (technically).
			 */
			sock->reading_throttled = false;
			isc__nm_start_reading(sock);
			isc__nmsocket_reset(sock);
		}
		return;
	}

	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
	tcp_maybe_restart_reading(sock);
}

/*
 * Try to send synchronously with uv_try_write(); on a partial write (or
 * EAGAIN/ENOSYS) fall back to an asynchronous uv_write() of the remainder
 * and throttle server-side reading until it completes.  For DNS messages
 * (req->tcplen != 0) the 2-byte length prefix is sent as a second buffer,
 * and partial-write offsets account for those 2 extra bytes.
 */
static isc_result_t
tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(VALID_UVREQ(req));
	REQUIRE(sock->tid == isc_tid());
	REQUIRE(sock->type == isc_nm_tcpsocket);

	int r;
	uv_buf_t bufs[2] = { { 0 }, { 0 } }; /* ugly, but required for old GCC
						versions */
	size_t nbufs = 1;

	if (isc__nmsocket_closing(sock)) {
		return ISC_R_CANCELED;
	}

	/* Check if we are not trying to send a DNS message */
	if (*(uint16_t *)req->tcplen == 0) {
		bufs[0].base = req->uvbuf.base;
		bufs[0].len = req->uvbuf.len;

		r = uv_try_write(&sock->uv_handle.stream, bufs, nbufs);

		if (r == (int)(bufs[0].len)) {
			/* Wrote everything */
			isc__nm_sendcb(sock, req, ISC_R_SUCCESS, true);
			tcp_maybe_restart_reading(sock);
			return ISC_R_SUCCESS;
		} else if (r > 0) {
			/* Partial write; advance past the bytes written */
			bufs[0].base += (size_t)r;
			bufs[0].len -= (size_t)r;
		} else if (!(r == UV_ENOSYS || r == UV_EAGAIN)) {
			return isc_uverr2result(r);
		}
	} else {
		nbufs = 2;
		bufs[0].base = req->tcplen;
		bufs[0].len = 2;
		bufs[1].base = req->uvbuf.base;
		bufs[1].len = req->uvbuf.len;

		r = uv_try_write(&sock->uv_handle.stream, bufs, nbufs);

		if (r == (int)(bufs[0].len + bufs[1].len)) {
			/* Wrote everything */
			isc__nm_sendcb(sock, req, ISC_R_SUCCESS, true);
			tcp_maybe_restart_reading(sock);
			return ISC_R_SUCCESS;
		} else if (r == 1) {
			/* Partial write of DNSMSG length */
			bufs[0].base = req->tcplen + 1;
			bufs[0].len = 1;
		} else if (r > 0) {
			/* Partial write of DNSMSG */
			nbufs = 1;
			bufs[0].base = req->uvbuf.base + (r - 2);
			bufs[0].len = req->uvbuf.len - (r - 2);
		} else if (!(r == UV_ENOSYS || r == UV_EAGAIN)) {
			return isc_uverr2result(r);
		}
	}

	/* The kernel buffer is full; pause reading until the write drains. */
	if (!sock->client && sock->reading) {
		sock->reading_throttled = true;
		isc__nm_stop_reading(sock);
	}
	isc__nmsocket_log(sock, ISC_LOG_DEBUG(3),
			  "%sthe other side is not "
			  "reading the data, switching to uv_write()",
			  !sock->client && sock->reading
				  ? "throttling TCP connection, "
				  : "");

	r = uv_write(&req->uv_req.write, &sock->uv_handle.stream, bufs, nbufs,
		     tcp_send_cb);
	if (r < 0) {
		return isc_uverr2result(r);
	}

	isc_nm_timer_create(req->handle, isc__nmsocket_writetimeout_cb, req,
			    &req->timer);
	if (sock->write_timeout > 0) {
		isc_nm_timer_start(req->timer, sock->write_timeout);
	}

	return ISC_R_SUCCESS;
}

/*
 * Final bookkeeping for a closed connection socket: release the TCP
 * clients quota (if this was an accepted connection), detach from the
 * server socket, and schedule destruction.
 */
static void
tcp_close_sock(isc_nmsocket_t *sock) {
	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(sock->tid == isc_tid());
	REQUIRE(sock->closing);
	REQUIRE(!sock->closed);

	sock->closed = true;
	sock->connected = false;

	isc__nm_incstats(sock, STATID_CLOSE);

	if (sock->server != NULL) {
		if (sock->server->pquota != NULL) {
			isc__nm_decstats(sock, STATID_CLIENTS);
			isc_quota_release(sock->server->pquota);
		}
		isc__nmsocket_detach(&sock->server);
	}

	isc__nmsocket_prep_destroy(sock);
}

/* uv_close() callback that finishes closing via tcp_close_sock(). */
static void
tcp_close_cb(uv_handle_t *handle) {
	isc_nmsocket_t *sock = uv_handle_get_data(handle);
	uv_handle_set_data(handle, NULL);

	tcp_close_sock(sock);
}

/*
 * Close a connection socket, scheduling tcp_close_cb() on whichever uv
 * handle is still open (the socket itself, or just the timer if the
 * socket was already closed elsewhere).
 */
void
isc__nm_tcp_close(isc_nmsocket_t *sock) {
	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(sock->type == isc_nm_tcpsocket);
	REQUIRE(!isc__nmsocket_active(sock));
	REQUIRE(sock->tid == isc_tid());
	REQUIRE(sock->parent == NULL);
	REQUIRE(!sock->closing);

	sock->closing = true;

	/*
	 * The order of the close operation is important here, the uv_close()
	 * gets scheduled in the reverse order, so we need to close the timer
	 * last, so its gone by the time we destroy the socket
	 */

	if (!uv_is_closing(&sock->uv_handle.handle)) {
		/* Normal order of operation */

		/* 2. close the socket + destroy the socket in callback */
		isc__nmsocket_clearcb(sock);
		isc__nm_stop_reading(sock);
		sock->reading = false;
		uv_close(&sock->uv_handle.handle, tcp_close_cb);

		/* 1. close the timer */
		isc__nmsocket_timer_stop(sock);
		uv_close((uv_handle_t *)&sock->read_timer, NULL);
	} else {
		/* The socket was already closed elsewhere */

		/* 1. close the timer + destroy the socket in callback */
		isc__nmsocket_timer_stop(sock);
		uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock);
		uv_close((uv_handle_t *)&sock->read_timer, tcp_close_cb);
	}
}

/*
 * uv_close() callback used during shutdown of a connecting socket:
 * schedule destruction and drop the reference.
 */
static void
tcp_close_connect_cb(uv_handle_t *handle) {
	isc_nmsocket_t *sock = uv_handle_get_data(handle);

	REQUIRE(VALID_NMSOCK(sock));

	REQUIRE(sock->tid == isc_tid());

	isc__nmsocket_prep_destroy(sock);
	isc__nmsocket_detach(&sock);
}

void
isc__nm_tcp_shutdown(isc_nmsocket_t *sock) {
	REQUIRE(VALID_NMSOCK(sock));
	REQUIRE(sock->tid == isc_tid());
	REQUIRE(sock->type == isc_nm_tcpsocket);

	/*
	 * If the socket is active, mark it inactive and
	 * continue. If it isn't active, stop now.
1291 */ 1292 if (!sock->active) { 1293 return; 1294 } 1295 sock->active = false; 1296 1297 INSIST(!sock->accepting); 1298 1299 if (sock->connecting) { 1300 isc_nmsocket_t *tsock = NULL; 1301 isc__nmsocket_attach(sock, &tsock); 1302 uv_close(&sock->uv_handle.handle, tcp_close_connect_cb); 1303 return; 1304 } 1305 1306 /* There's a handle attached to the socket (from accept or connect) */ 1307 if (sock->statichandle) { 1308 isc__nm_failed_read_cb(sock, ISC_R_SHUTTINGDOWN, false); 1309 return; 1310 } 1311 1312 /* Destroy the non-listening socket */ 1313 if (sock->parent == NULL) { 1314 isc__nmsocket_prep_destroy(sock); 1315 return; 1316 } 1317 1318 /* Destroy the listening socket if on the same loop */ 1319 if (sock->tid == sock->parent->tid) { 1320 isc__nmsocket_prep_destroy(sock->parent); 1321 } 1322 } 1323 1324 void 1325 isc__nmhandle_tcp_set_manual_timer(isc_nmhandle_t *handle, const bool manual) { 1326 isc_nmsocket_t *sock; 1327 1328 REQUIRE(VALID_NMHANDLE(handle)); 1329 sock = handle->sock; 1330 REQUIRE(VALID_NMSOCK(sock)); 1331 REQUIRE(sock->type == isc_nm_tcpsocket); 1332 REQUIRE(sock->tid == isc_tid()); 1333 REQUIRE(!uv_is_active(&sock->uv_handle.handle)); 1334 1335 sock->manual_read_timer = manual; 1336 } 1337