Home | History | Annotate | Line # | Download | only in netmgr
      1 /*	$NetBSD: socket.c,v 1.3 2026/04/08 00:16:16 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * SPDX-License-Identifier: MPL-2.0
      7  *
      8  * This Source Code Form is subject to the terms of the Mozilla Public
      9  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  *
     12  * See the COPYRIGHT file distributed with this work for additional
     13  * information regarding copyright ownership.
     14  */
     15 
     16 #include <netinet/in.h>
     17 
     18 #include <isc/errno.h>
     19 #include <isc/result.h>
     20 #include <isc/uv.h>
     21 
     22 #include "netmgr-int.h"
     23 
/*
 * Convenience wrappers around setsockopt() for boolean (int-valued) socket
 * options: setsockopt_on() stores 1, setsockopt_off() stores 0.  Both
 * evaluate to the setsockopt() return value (0 on success, -1 on error).
 *
 * Arguments are parenthesized so that arbitrary expressions can be passed
 * safely, and the first parameter is named 'fd' rather than 'socket' so
 * that it cannot be confused with the socket() function.
 */
#define setsockopt_on(fd, level, name) \
	setsockopt((fd), (level), (name), &(int){ 1 }, sizeof(int))

#define setsockopt_off(fd, level, name) \
	setsockopt((fd), (level), (name), &(int){ 0 }, sizeof(int))
     29 
     30 static isc_result_t
     31 socket_freebind(uv_os_sock_t fd, sa_family_t sa_family) {
     32 	/*
     33 	 * Set the IP_FREEBIND (or equivalent option) on the uv_handle.
     34 	 */
     35 #ifdef IP_FREEBIND
     36 	UNUSED(sa_family);
     37 	if (setsockopt_on(fd, IPPROTO_IP, IP_FREEBIND) == -1) {
     38 		return ISC_R_FAILURE;
     39 	}
     40 	return ISC_R_SUCCESS;
     41 #elif defined(IP_BINDANY) || defined(IPV6_BINDANY)
     42 	if (sa_family == AF_INET) {
     43 #if defined(IP_BINDANY)
     44 		if (setsockopt_on(fd, IPPROTO_IP, IP_BINDANY) == -1) {
     45 			return ISC_R_FAILURE;
     46 		}
     47 		return ISC_R_SUCCESS;
     48 #endif
     49 	} else if (sa_family == AF_INET6) {
     50 #if defined(IPV6_BINDANY)
     51 		if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_BINDANY) == -1) {
     52 			return ISC_R_FAILURE;
     53 		}
     54 		return ISC_R_SUCCESS;
     55 #endif
     56 	}
     57 	return ISC_R_NOTIMPLEMENTED;
     58 #elif defined(SO_BINDANY)
     59 	UNUSED(sa_family);
     60 	if (setsockopt_on(fd, SOL_SOCKET, SO_BINDANY) == -1) {
     61 		return ISC_R_FAILURE;
     62 	}
     63 	return ISC_R_SUCCESS;
     64 #else
     65 	UNUSED(fd);
     66 	UNUSED(sa_family);
     67 	return ISC_R_NOTIMPLEMENTED;
     68 #endif
     69 }
     70 
     71 int
     72 isc__nm_udp_freebind(uv_udp_t *handle, const struct sockaddr *addr,
     73 		     unsigned int flags) {
     74 	int r;
     75 	uv_os_sock_t fd = -1;
     76 
     77 	r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
     78 	if (r < 0) {
     79 		return r;
     80 	}
     81 
     82 	r = uv_udp_bind(handle, addr, flags);
     83 	if (r == UV_EADDRNOTAVAIL &&
     84 	    socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS)
     85 	{
     86 		/*
     87 		 * Retry binding with IP_FREEBIND (or equivalent option) if the
     88 		 * address is not available. This helps with IPv6 tentative
     89 		 * addresses which are reported by the route socket, although
     90 		 * named is not yet able to properly bind to them.
     91 		 */
     92 		r = uv_udp_bind(handle, addr, flags);
     93 	}
     94 
     95 	return r;
     96 }
     97 
     98 static int
     99 tcp_bind_now(uv_tcp_t *handle, const struct sockaddr *addr,
    100 	     unsigned int flags) {
    101 	int r;
    102 	struct sockaddr_storage sname;
    103 	int snamelen = sizeof(sname);
    104 
    105 	r = uv_tcp_bind(handle, addr, flags);
    106 	if (r < 0) {
    107 		return r;
    108 	}
    109 
    110 	/*
    111 	 * uv_tcp_bind() uses a delayed error, initially returning
    112 	 * success even if bind() fails. By calling uv_tcp_getsockname()
    113 	 * here we can find out whether the bind() call was successful.
    114 	 */
    115 	r = uv_tcp_getsockname(handle, (struct sockaddr *)&sname, &snamelen);
    116 	if (r < 0) {
    117 		return r;
    118 	}
    119 
    120 	return 0;
    121 }
    122 
    123 int
    124 isc__nm_tcp_freebind(uv_tcp_t *handle, const struct sockaddr *addr,
    125 		     unsigned int flags) {
    126 	int r;
    127 	uv_os_sock_t fd = -1;
    128 
    129 	r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
    130 	if (r < 0) {
    131 		return r;
    132 	}
    133 
    134 	r = tcp_bind_now(handle, addr, flags);
    135 	if (r == UV_EADDRNOTAVAIL &&
    136 	    socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS)
    137 	{
    138 		/*
    139 		 * Retry binding with IP_FREEBIND (or equivalent option) if the
    140 		 * address is not available. This helps with IPv6 tentative
    141 		 * addresses which are reported by the route socket, although
    142 		 * named is not yet able to properly bind to them.
    143 		 */
    144 		r = tcp_bind_now(handle, addr, flags);
    145 	}
    146 
    147 	return r;
    148 }
    149 
    150 isc_result_t
    151 isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp) {
    152 	int sock = socket(domain, type, protocol);
    153 	if (sock < 0) {
    154 		return isc_errno_toresult(errno);
    155 	}
    156 
    157 	*sockp = (uv_os_sock_t)sock;
    158 	return ISC_R_SUCCESS;
    159 }
    160 
    161 void
    162 isc__nm_closesocket(uv_os_sock_t sock) {
    163 	close(sock);
    164 }
    165 
    166 isc_result_t
    167 isc__nm_socket_reuse(uv_os_sock_t fd, int val) {
    168 	/*
    169 	 * Generally, the SO_REUSEADDR socket option allows reuse of
    170 	 * local addresses.
    171 	 *
    172 	 * On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some
    173 	 * additional refinements for programs that use multicast.
    174 	 *
    175 	 * On Linux, SO_REUSEPORT has different semantics: it _shares_ the port
    176 	 * rather than steal it from the current listener, so we don't use it
    177 	 * here, but rather in isc__nm_socket_reuse_lb().
    178 	 */
    179 
    180 #if defined(SO_REUSEPORT) && !defined(__linux__)
    181 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)) == -1) {
    182 		return ISC_R_FAILURE;
    183 	}
    184 	return ISC_R_SUCCESS;
    185 #elif defined(SO_REUSEADDR)
    186 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) == -1) {
    187 		return ISC_R_FAILURE;
    188 	}
    189 	return ISC_R_SUCCESS;
    190 #else
    191 	UNUSED(fd);
    192 	return ISC_R_NOTIMPLEMENTED;
    193 #endif
    194 }
    195 
    196 isc_result_t
    197 isc__nm_socket_reuse_lb(uv_os_sock_t fd) {
    198 	/*
    199 	 * On FreeBSD 12+, SO_REUSEPORT_LB socket option allows sockets to be
    200 	 * bound to an identical socket address. For UDP sockets, the use of
    201 	 * this option can provide better distribution of incoming datagrams to
    202 	 * multiple processes (or threads) as compared to the traditional
    203 	 * technique of having multiple processes compete to receive datagrams
    204 	 * on the same socket.
    205 	 *
    206 	 * On Linux, the same thing is achieved simply with SO_REUSEPORT.
    207 	 */
    208 #if defined(SO_REUSEPORT_LB)
    209 	if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT_LB) == -1) {
    210 		return ISC_R_FAILURE;
    211 	} else {
    212 		return ISC_R_SUCCESS;
    213 	}
    214 #elif defined(SO_REUSEPORT) && defined(__linux__)
    215 	if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) {
    216 		return ISC_R_FAILURE;
    217 	} else {
    218 		return ISC_R_SUCCESS;
    219 	}
    220 #else
    221 	UNUSED(fd);
    222 	return ISC_R_NOTIMPLEMENTED;
    223 #endif
    224 }
    225 
    226 isc_result_t
    227 isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family) {
    228 	/*
    229 	 * Disable the Path MTU Discovery on IP packets
    230 	 */
    231 	if (sa_family == AF_INET6) {
    232 #if defined(IPV6_DONTFRAG)
    233 		if (setsockopt_off(fd, IPPROTO_IPV6, IPV6_DONTFRAG) == -1) {
    234 			return ISC_R_FAILURE;
    235 		} else {
    236 			return ISC_R_SUCCESS;
    237 		}
    238 #elif defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
    239 		if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
    240 			       &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1)
    241 		{
    242 			return ISC_R_FAILURE;
    243 		} else {
    244 			return ISC_R_SUCCESS;
    245 		}
    246 #else
    247 		UNUSED(fd);
    248 #endif
    249 	} else if (sa_family == AF_INET) {
    250 #if defined(IP_DONTFRAG)
    251 		if (setsockopt_off(fd, IPPROTO_IP, IP_DONTFRAG) == -1) {
    252 			return ISC_R_FAILURE;
    253 		} else {
    254 			return ISC_R_SUCCESS;
    255 		}
    256 #elif defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
    257 		if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
    258 			       &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1)
    259 		{
    260 			return ISC_R_FAILURE;
    261 		} else {
    262 			return ISC_R_SUCCESS;
    263 		}
    264 #else
    265 		UNUSED(fd);
    266 #endif
    267 	} else {
    268 		return ISC_R_FAMILYNOSUPPORT;
    269 	}
    270 
    271 	return ISC_R_NOTIMPLEMENTED;
    272 }
    273 
    274 isc_result_t
    275 isc__nm_socket_v6only(uv_os_sock_t fd, sa_family_t sa_family) {
    276 	/*
    277 	 * Enable the IPv6-only option on IPv6 sockets
    278 	 */
    279 	if (sa_family == AF_INET6) {
    280 #if defined(IPV6_V6ONLY)
    281 		if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_V6ONLY) == -1) {
    282 			return ISC_R_FAILURE;
    283 		} else {
    284 			return ISC_R_SUCCESS;
    285 		}
    286 #else
    287 		UNUSED(fd);
    288 #endif
    289 	}
    290 	return ISC_R_NOTIMPLEMENTED;
    291 }
    292 
    293 isc_result_t
    294 isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms) {
    295 #if defined(TIMEOUT_OPTNAME)
    296 	TIMEOUT_TYPE timeout = timeout_ms / TIMEOUT_DIV;
    297 
    298 	if (timeout == 0) {
    299 		timeout = 1;
    300 	}
    301 
    302 	if (setsockopt(fd, IPPROTO_TCP, TIMEOUT_OPTNAME, &timeout,
    303 		       sizeof(timeout)) == -1)
    304 	{
    305 		return ISC_R_FAILURE;
    306 	}
    307 
    308 	return ISC_R_SUCCESS;
    309 #else
    310 	UNUSED(fd);
    311 	UNUSED(timeout_ms);
    312 
    313 	return ISC_R_SUCCESS;
    314 #endif
    315 }
    316 
    317 isc_result_t
    318 isc__nm_socket_tcp_nodelay(uv_os_sock_t fd, bool value) {
    319 #ifdef TCP_NODELAY
    320 	int ret;
    321 
    322 	if (value) {
    323 		ret = setsockopt_on(fd, IPPROTO_TCP, TCP_NODELAY);
    324 	} else {
    325 		ret = setsockopt_off(fd, IPPROTO_TCP, TCP_NODELAY);
    326 	}
    327 
    328 	if (ret == -1) {
    329 		return ISC_R_FAILURE;
    330 	} else {
    331 		return ISC_R_SUCCESS;
    332 	}
    333 #else
    334 	UNUSED(fd);
    335 	return ISC_R_SUCCESS;
    336 #endif
    337 }
    338 
    339 isc_result_t
    340 isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size) {
    341 #ifdef TCP_MAXSEG
    342 	if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *)&size,
    343 		       sizeof(size)))
    344 	{
    345 		return ISC_R_FAILURE;
    346 	} else {
    347 		return ISC_R_SUCCESS;
    348 	}
    349 #else
    350 	UNUSED(fd);
    351 	UNUSED(size);
    352 	return ISC_R_SUCCESS;
    353 #endif
    354 }
    355 
    356 isc_result_t
    357 isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
    358 	if (sa_family != AF_INET6) {
    359 		return ISC_R_SUCCESS;
    360 	}
    361 #ifdef IPV6_USE_MIN_MTU
    362 	if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU) == -1) {
    363 		return ISC_R_FAILURE;
    364 	}
    365 #elif defined(IPV6_MTU)
    366 	if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &(int){ 1280 },
    367 		       sizeof(int)) == -1)
    368 	{
    369 		return ISC_R_FAILURE;
    370 	}
    371 #else
    372 	UNUSED(fd);
    373 #endif
    374 
    375 	return ISC_R_SUCCESS;
    376 }
    377 
    378 /*
    379  * See
    380  * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
 * for the rationale.
    382  */
    383 #define PORT_RANGE 1000
    384 
    385 isc_result_t
    386 isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
    387 			      sa_family_t sa_family ISC_ATTR_UNUSED,
    388 			      in_port_t port_low ISC_ATTR_UNUSED,
    389 			      in_port_t port_high ISC_ATTR_UNUSED) {
    390 #ifdef IP_BIND_ADDRESS_NO_PORT
    391 	if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
    392 		return ISC_R_FAILURE;
    393 	}
    394 #endif
    395 
    396 #if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
    397 	/*
    398 	 * The option takes an uint32_t value with the high 16 bits
    399 	 * set to the upper range bound, and the low 16 bits set to
    400 	 * the lower range bound.  Range bounds are inclusive.  The
    401 	 * 16-bit values should be in host byte order.
    402 	 */
    403 	uint32_t port_range;
    404 	int major, minor;
    405 	isc_os_kernel(NULL, &major, &minor, NULL);
    406 
    407 	/*
    408 	 * Linux 6.8 implemented a following patch:
    409 	 *
    410 	 * If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
    411 	 * port selection no longer favors even ports.
    412 	 *
    413 	 * This means that connect() can find a suitable source port
    414 	 * faster, and applications can use a different split between
    415 	 * connect() and bind() users.
    416 	 */
    417 	if (major < 6 || (major == 6 && minor < 8)) {
    418 		/*
    419 		 * On Linux << 6.8, use IP_LOCAL_PORT_RANGE to
    420 		 * partition ephemeral port range randomly to help
    421 		 * with the port selection.
    422 		 */
    423 		if (port_high - port_low <= PORT_RANGE) {
    424 			return ISC_R_RANGE;
    425 		}
    426 
    427 		/*
    428 		 * port_low <= N < port_high - PORT_RANGE
    429 		 */
    430 		port_high -= PORT_RANGE;
    431 		port_low += isc_random_uniform(port_high - port_low);
    432 		port_high = port_low + PORT_RANGE;
    433 	}
    434 	INSIST(port_low > 0);
    435 	INSIST(port_low < port_high);
    436 
    437 	port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);
    438 	if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
    439 		       sizeof(port_range)) == -1)
    440 	{
    441 		return ISC_R_FAILURE;
    442 	}
    443 #endif
    444 	return ISC_R_SUCCESS;
    445 }
    446