1 /* $NetBSD: socket.c,v 1.3 2026/04/08 00:16:16 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 #include <netinet/in.h> 17 18 #include <isc/errno.h> 19 #include <isc/result.h> 20 #include <isc/uv.h> 21 22 #include "netmgr-int.h" 23 24 #define setsockopt_on(socket, level, name) \ 25 setsockopt(socket, level, name, &(int){ 1 }, sizeof(int)) 26 27 #define setsockopt_off(socket, level, name) \ 28 setsockopt(socket, level, name, &(int){ 0 }, sizeof(int)) 29 30 static isc_result_t 31 socket_freebind(uv_os_sock_t fd, sa_family_t sa_family) { 32 /* 33 * Set the IP_FREEBIND (or equivalent option) on the uv_handle. 34 */ 35 #ifdef IP_FREEBIND 36 UNUSED(sa_family); 37 if (setsockopt_on(fd, IPPROTO_IP, IP_FREEBIND) == -1) { 38 return ISC_R_FAILURE; 39 } 40 return ISC_R_SUCCESS; 41 #elif defined(IP_BINDANY) || defined(IPV6_BINDANY) 42 if (sa_family == AF_INET) { 43 #if defined(IP_BINDANY) 44 if (setsockopt_on(fd, IPPROTO_IP, IP_BINDANY) == -1) { 45 return ISC_R_FAILURE; 46 } 47 return ISC_R_SUCCESS; 48 #endif 49 } else if (sa_family == AF_INET6) { 50 #if defined(IPV6_BINDANY) 51 if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_BINDANY) == -1) { 52 return ISC_R_FAILURE; 53 } 54 return ISC_R_SUCCESS; 55 #endif 56 } 57 return ISC_R_NOTIMPLEMENTED; 58 #elif defined(SO_BINDANY) 59 UNUSED(sa_family); 60 if (setsockopt_on(fd, SOL_SOCKET, SO_BINDANY) == -1) { 61 return ISC_R_FAILURE; 62 } 63 return ISC_R_SUCCESS; 64 #else 65 UNUSED(fd); 66 UNUSED(sa_family); 67 return ISC_R_NOTIMPLEMENTED; 68 #endif 69 } 70 71 int 72 isc__nm_udp_freebind(uv_udp_t *handle, const struct sockaddr *addr, 73 unsigned int flags) { 74 int r; 75 uv_os_sock_t fd = -1; 76 77 r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd); 78 if (r < 0) { 79 return r; 80 } 81 82 r = uv_udp_bind(handle, addr, flags); 83 if (r == UV_EADDRNOTAVAIL && 84 socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS) 85 { 86 /* 87 * Retry binding with IP_FREEBIND (or equivalent option) if the 88 * address is not available. This helps with IPv6 tentative 89 * addresses which are reported by the route socket, although 90 * named is not yet able to properly bind to them. 91 */ 92 r = uv_udp_bind(handle, addr, flags); 93 } 94 95 return r; 96 } 97 98 static int 99 tcp_bind_now(uv_tcp_t *handle, const struct sockaddr *addr, 100 unsigned int flags) { 101 int r; 102 struct sockaddr_storage sname; 103 int snamelen = sizeof(sname); 104 105 r = uv_tcp_bind(handle, addr, flags); 106 if (r < 0) { 107 return r; 108 } 109 110 /* 111 * uv_tcp_bind() uses a delayed error, initially returning 112 * success even if bind() fails. By calling uv_tcp_getsockname() 113 * here we can find out whether the bind() call was successful. 114 */ 115 r = uv_tcp_getsockname(handle, (struct sockaddr *)&sname, &snamelen); 116 if (r < 0) { 117 return r; 118 } 119 120 return 0; 121 } 122 123 int 124 isc__nm_tcp_freebind(uv_tcp_t *handle, const struct sockaddr *addr, 125 unsigned int flags) { 126 int r; 127 uv_os_sock_t fd = -1; 128 129 r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd); 130 if (r < 0) { 131 return r; 132 } 133 134 r = tcp_bind_now(handle, addr, flags); 135 if (r == UV_EADDRNOTAVAIL && 136 socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS) 137 { 138 /* 139 * Retry binding with IP_FREEBIND (or equivalent option) if the 140 * address is not available. This helps with IPv6 tentative 141 * addresses which are reported by the route socket, although 142 * named is not yet able to properly bind to them. 143 */ 144 r = tcp_bind_now(handle, addr, flags); 145 } 146 147 return r; 148 } 149 150 isc_result_t 151 isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp) { 152 int sock = socket(domain, type, protocol); 153 if (sock < 0) { 154 return isc_errno_toresult(errno); 155 } 156 157 *sockp = (uv_os_sock_t)sock; 158 return ISC_R_SUCCESS; 159 } 160 161 void 162 isc__nm_closesocket(uv_os_sock_t sock) { 163 close(sock); 164 } 165 166 isc_result_t 167 isc__nm_socket_reuse(uv_os_sock_t fd, int val) { 168 /* 169 * Generally, the SO_REUSEADDR socket option allows reuse of 170 * local addresses. 171 * 172 * On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some 173 * additional refinements for programs that use multicast. 174 * 175 * On Linux, SO_REUSEPORT has different semantics: it _shares_ the port 176 * rather than steal it from the current listener, so we don't use it 177 * here, but rather in isc__nm_socket_reuse_lb(). 178 */ 179 180 #if defined(SO_REUSEPORT) && !defined(__linux__) 181 if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)) == -1) { 182 return ISC_R_FAILURE; 183 } 184 return ISC_R_SUCCESS; 185 #elif defined(SO_REUSEADDR) 186 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) == -1) { 187 return ISC_R_FAILURE; 188 } 189 return ISC_R_SUCCESS; 190 #else 191 UNUSED(fd); 192 return ISC_R_NOTIMPLEMENTED; 193 #endif 194 } 195 196 isc_result_t 197 isc__nm_socket_reuse_lb(uv_os_sock_t fd) { 198 /* 199 * On FreeBSD 12+, SO_REUSEPORT_LB socket option allows sockets to be 200 * bound to an identical socket address. For UDP sockets, the use of 201 * this option can provide better distribution of incoming datagrams to 202 * multiple processes (or threads) as compared to the traditional 203 * technique of having multiple processes compete to receive datagrams 204 * on the same socket. 205 * 206 * On Linux, the same thing is achieved simply with SO_REUSEPORT. 207 */ 208 #if defined(SO_REUSEPORT_LB) 209 if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT_LB) == -1) { 210 return ISC_R_FAILURE; 211 } else { 212 return ISC_R_SUCCESS; 213 } 214 #elif defined(SO_REUSEPORT) && defined(__linux__) 215 if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) { 216 return ISC_R_FAILURE; 217 } else { 218 return ISC_R_SUCCESS; 219 } 220 #else 221 UNUSED(fd); 222 return ISC_R_NOTIMPLEMENTED; 223 #endif 224 } 225 226 isc_result_t 227 isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family) { 228 /* 229 * Disable the Path MTU Discovery on IP packets 230 */ 231 if (sa_family == AF_INET6) { 232 #if defined(IPV6_DONTFRAG) 233 if (setsockopt_off(fd, IPPROTO_IPV6, IPV6_DONTFRAG) == -1) { 234 return ISC_R_FAILURE; 235 } else { 236 return ISC_R_SUCCESS; 237 } 238 #elif defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT) 239 if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER, 240 &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1) 241 { 242 return ISC_R_FAILURE; 243 } else { 244 return ISC_R_SUCCESS; 245 } 246 #else 247 UNUSED(fd); 248 #endif 249 } else if (sa_family == AF_INET) { 250 #if defined(IP_DONTFRAG) 251 if (setsockopt_off(fd, IPPROTO_IP, IP_DONTFRAG) == -1) { 252 return ISC_R_FAILURE; 253 } else { 254 return ISC_R_SUCCESS; 255 } 256 #elif defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT) 257 if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, 258 &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1) 259 { 260 return ISC_R_FAILURE; 261 } else { 262 return ISC_R_SUCCESS; 263 } 264 #else 265 UNUSED(fd); 266 #endif 267 } else { 268 return ISC_R_FAMILYNOSUPPORT; 269 } 270 271 return ISC_R_NOTIMPLEMENTED; 272 } 273 274 isc_result_t 275 isc__nm_socket_v6only(uv_os_sock_t fd, sa_family_t sa_family) { 276 /* 277 * Enable the IPv6-only option on IPv6 sockets 278 */ 279 if (sa_family == AF_INET6) { 280 #if defined(IPV6_V6ONLY) 281 if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_V6ONLY) == -1) { 282 return ISC_R_FAILURE; 283 } else { 284 return ISC_R_SUCCESS; 285 } 286 #else 287 UNUSED(fd); 288 #endif 289 } 290 return ISC_R_NOTIMPLEMENTED; 291 } 292 293 isc_result_t 294 isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms) { 295 #if defined(TIMEOUT_OPTNAME) 296 TIMEOUT_TYPE timeout = timeout_ms / TIMEOUT_DIV; 297 298 if (timeout == 0) { 299 timeout = 1; 300 } 301 302 if (setsockopt(fd, IPPROTO_TCP, TIMEOUT_OPTNAME, &timeout, 303 sizeof(timeout)) == -1) 304 { 305 return ISC_R_FAILURE; 306 } 307 308 return ISC_R_SUCCESS; 309 #else 310 UNUSED(fd); 311 UNUSED(timeout_ms); 312 313 return ISC_R_SUCCESS; 314 #endif 315 } 316 317 isc_result_t 318 isc__nm_socket_tcp_nodelay(uv_os_sock_t fd, bool value) { 319 #ifdef TCP_NODELAY 320 int ret; 321 322 if (value) { 323 ret = setsockopt_on(fd, IPPROTO_TCP, TCP_NODELAY); 324 } else { 325 ret = setsockopt_off(fd, IPPROTO_TCP, TCP_NODELAY); 326 } 327 328 if (ret == -1) { 329 return ISC_R_FAILURE; 330 } else { 331 return ISC_R_SUCCESS; 332 } 333 #else 334 UNUSED(fd); 335 return ISC_R_SUCCESS; 336 #endif 337 } 338 339 isc_result_t 340 isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size) { 341 #ifdef TCP_MAXSEG 342 if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *)&size, 343 sizeof(size))) 344 { 345 return ISC_R_FAILURE; 346 } else { 347 return ISC_R_SUCCESS; 348 } 349 #else 350 UNUSED(fd); 351 UNUSED(size); 352 return ISC_R_SUCCESS; 353 #endif 354 } 355 356 isc_result_t 357 isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) { 358 if (sa_family != AF_INET6) { 359 return ISC_R_SUCCESS; 360 } 361 #ifdef IPV6_USE_MIN_MTU 362 if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU) == -1) { 363 return ISC_R_FAILURE; 364 } 365 #elif defined(IPV6_MTU) 366 if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &(int){ 1280 }, 367 sizeof(int)) == -1) 368 { 369 return ISC_R_FAILURE; 370 } 371 #else 372 UNUSED(fd); 373 #endif 374 375 return ISC_R_SUCCESS; 376 } 377 378 /* 379 * See 380 * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel 381 * for rationalle. 382 */ 383 #define PORT_RANGE 1000 384 385 isc_result_t 386 isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED, 387 sa_family_t sa_family ISC_ATTR_UNUSED, 388 in_port_t port_low ISC_ATTR_UNUSED, 389 in_port_t port_high ISC_ATTR_UNUSED) { 390 #ifdef IP_BIND_ADDRESS_NO_PORT 391 if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) { 392 return ISC_R_FAILURE; 393 } 394 #endif 395 396 #if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__) 397 /* 398 * The option takes an uint32_t value with the high 16 bits 399 * set to the upper range bound, and the low 16 bits set to 400 * the lower range bound. Range bounds are inclusive. The 401 * 16-bit values should be in host byte order. 402 */ 403 uint32_t port_range; 404 int major, minor; 405 isc_os_kernel(NULL, &major, &minor, NULL); 406 407 /* 408 * Linux 6.8 implemented a following patch: 409 * 410 * If IP_LOCAL_PORT_RANGE is set on a socket before accept(), 411 * port selection no longer favors even ports. 412 * 413 * This means that connect() can find a suitable source port 414 * faster, and applications can use a different split between 415 * connect() and bind() users. 416 */ 417 if (major < 6 || (major == 6 && minor < 8)) { 418 /* 419 * On Linux << 6.8, use IP_LOCAL_PORT_RANGE to 420 * partition ephemeral port range randomly to help 421 * with the port selection. 422 */ 423 if (port_high - port_low <= PORT_RANGE) { 424 return ISC_R_RANGE; 425 } 426 427 /* 428 * port_low <= N < port_high - PORT_RANGE 429 */ 430 port_high -= PORT_RANGE; 431 port_low += isc_random_uniform(port_high - port_low); 432 port_high = port_low + PORT_RANGE; 433 } 434 INSIST(port_low > 0); 435 INSIST(port_low < port_high); 436 437 port_range = (uint32_t)port_low | ((uint32_t)port_high << 16); 438 if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range, 439 sizeof(port_range)) == -1) 440 { 441 return ISC_R_FAILURE; 442 } 443 #endif 444 return ISC_R_SUCCESS; 445 } 446