Home | History | Annotate | Line # | Download | only in unix
      1 /*	$NetBSD: socket.c,v 1.1 2024/02/18 20:57:57 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * SPDX-License-Identifier: MPL-2.0
      7  *
      8  * This Source Code Form is subject to the terms of the Mozilla Public
      9  * License, v. 2.0.  If a copy of the MPL was not distributed with this
     10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  *
     12  * See the COPYRIGHT file distributed with this work for additional
     13  * information regarding copyright ownership.
     14  */
     15 
     16 /*! \file */
     17 
     18 #include <inttypes.h>
     19 #include <stdbool.h>
     20 #include <sys/param.h>
     21 #include <sys/socket.h>
     22 #include <sys/stat.h>
     23 #include <sys/types.h>
     24 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
     25 #include <sys/sysctl.h>
     26 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
     27 #include <sys/time.h>
     28 #include <sys/uio.h>
     29 
     30 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
     31 #include <linux/netlink.h>
     32 #include <linux/rtnetlink.h>
     33 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
     34 	*/
     35 
     36 #include <errno.h>
     37 #include <fcntl.h>
     38 #include <stddef.h>
     39 #include <stdlib.h>
     40 #include <unistd.h>
     41 
     42 #include <isc/app.h>
     43 #include <isc/buffer.h>
     44 #include <isc/condition.h>
     45 #include <isc/formatcheck.h>
     46 #include <isc/list.h>
     47 #include <isc/log.h>
     48 #include <isc/mem.h>
     49 #include <isc/mutex.h>
     50 #include <isc/net.h>
     51 #include <isc/once.h>
     52 #include <isc/platform.h>
     53 #include <isc/print.h>
     54 #include <isc/refcount.h>
     55 #include <isc/region.h>
     56 #include <isc/resource.h>
     57 #include <isc/socket.h>
     58 #include <isc/stats.h>
     59 #include <isc/strerr.h>
     60 #include <isc/string.h>
     61 #include <isc/task.h>
     62 #include <isc/thread.h>
     63 #include <isc/util.h>
     64 
     65 #ifdef ISC_PLATFORM_HAVESYSUNH
     66 #include <sys/un.h>
     67 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
     68 #ifdef HAVE_KQUEUE
     69 #include <sys/event.h>
     70 #endif /* ifdef HAVE_KQUEUE */
     71 #ifdef HAVE_EPOLL_CREATE1
     72 #include <sys/epoll.h>
     73 #endif /* ifdef HAVE_EPOLL_CREATE1 */
     74 #if defined(HAVE_SYS_DEVPOLL_H)
     75 #include <sys/devpoll.h>
     76 #elif defined(HAVE_DEVPOLL_H)
     77 #include <devpoll.h>
     78 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
     79 
     80 #include <netinet/tcp.h>
     81 
     82 #include "errno2result.h"
     83 
     84 #ifdef ENABLE_TCP_FASTOPEN
     85 #include <netinet/tcp.h>
     86 #endif /* ifdef ENABLE_TCP_FASTOPEN */
     87 
     88 #ifdef HAVE_JSON_C
     89 #include <json_object.h>
     90 #endif /* HAVE_JSON_C */
     91 
     92 #ifdef HAVE_LIBXML2
     93 #include <libxml/xmlwriter.h>
     94 #define ISC_XMLCHAR (const xmlChar *)
     95 #endif /* HAVE_LIBXML2 */
     96 
     97 /*%
     98  * Choose the most preferable multiplex method.
     99  */
    100 #if defined(HAVE_KQUEUE)
    101 #define USE_KQUEUE
    102 #elif defined(HAVE_EPOLL_CREATE1)
    103 #define USE_EPOLL
    104 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
    105 #define USE_DEVPOLL
    106 typedef struct {
    107 	unsigned int want_read : 1, want_write : 1;
    108 } pollinfo_t;
    109 #else /* if defined(HAVE_KQUEUE) */
    110 #define USE_SELECT
    111 #endif /* HAVE_KQUEUE */
    112 
    113 /*
    114  * Set by the -T dscp option on the command line. If set to a value
    115  * other than -1, we check to make sure DSCP values match it, and
    116  * assert if not.
    117  */
    118 int isc_dscp_check_value = -1;
    119 
    120 /*%
    121  * Maximum number of allowable open sockets.  This is also the maximum
    122  * allowable socket file descriptor.
    123  *
    124  * Care should be taken before modifying this value for select():
    125  * The API standard doesn't ensure select() accepts more than (the system default
    126  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
    127  * the vast majority of cases.  This constant should therefore be increased only
    128  * when absolutely necessary and possible, i.e., the server is exhausting all
    129  * available file descriptors (up to FD_SETSIZE) and the select() function
    130  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
    131  * always be true, but we keep using some of them to ensure as much
    132  * portability as possible).  Note also that overall server performance
    133  * may be rather worsened with a larger value of this constant due to
    134  * inherent scalability problems of select().
    135  *
    136  * As a special note, this value shouldn't have to be touched if
    137  * this is a build for an authoritative only DNS server.
    138  */
    139 #ifndef ISC_SOCKET_MAXSOCKETS
    140 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
    141 #ifdef TUNE_LARGE
    142 #define ISC_SOCKET_MAXSOCKETS 21000
    143 #else /* ifdef TUNE_LARGE */
    144 #define ISC_SOCKET_MAXSOCKETS 4096
    145 #endif /* TUNE_LARGE */
    146 #elif defined(USE_SELECT)
    147 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
    148 #endif /* USE_KQUEUE... */
    149 #endif /* ISC_SOCKET_MAXSOCKETS */
    150 
    151 #ifdef USE_SELECT
    152 /*%
    153  * Mac OS X needs a special definition to support larger values in select().
    154  * We always define this because a larger value can be specified run-time.
    155  */
    156 #ifdef __APPLE__
    157 #define _DARWIN_UNLIMITED_SELECT
    158 #endif /* __APPLE__ */
    159 #endif /* USE_SELECT */
    160 
    161 #ifdef ISC_SOCKET_USE_POLLWATCH
    162 /*%
    163  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
    164  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
    165  * some of the specified FD.  The idea is based on the observation that it's
    166  * likely for a busy server to keep receiving packets.  It specifically works
    167  * as follows: the socket watcher is first initialized with the state of
    168  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
    169  * event occurs.  When it wakes up for a socket I/O event, it moves to the
    170  * poll_active state, and sets the poll timeout to a short period
    171  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
    172  * watcher goes to the poll_checking state with the same timeout period.
    173  * In this state, the watcher tries to detect whether this is a break
    174  * during intermittent events or the kernel bug is triggered.  If the next
    175  * polling reports an event within the short period, the previous timeout is
    176  * likely to be a kernel bug, and so the watcher goes back to the active state.
    177  * Otherwise, it moves to the idle state again.
    178  *
    179  * It's not clear whether this is a thread-related bug, but since we've only
    180  * seen this with threads, this workaround is used only when enabling threads.
    181  */
    182 
    183 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
    184 
    185 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
    186 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
    187 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
    188 #endif /* ISC_SOCKET_USE_POLLWATCH */
    189 
    190 /*%
    191  * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
    192  */
    193 #define FDLOCK_BITS  10
    194 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
    195 #define FDLOCK_ID(fd)                                   \
    196 	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
    197 	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
    198 
    199 /*%
    200  * Maximum number of events communicated with the kernel.  There should normally
    201  * be no need for having a large number.
    202  */
    203 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
    204 #ifndef ISC_SOCKET_MAXEVENTS
    205 #ifdef TUNE_LARGE
    206 #define ISC_SOCKET_MAXEVENTS 2048
    207 #else /* ifdef TUNE_LARGE */
    208 #define ISC_SOCKET_MAXEVENTS 64
    209 #endif /* TUNE_LARGE */
    210 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
    211 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
    212 	* */
    213 
    214 /*%
    215  * Some systems define the socket length argument as an int, some as size_t,
    216  * some as socklen_t.  This is here so it can be easily changed if needed.
    217  */
    218 #ifndef socklen_t
    219 #define socklen_t unsigned int
    220 #endif /* ifndef socklen_t */
    221 
    222 /*%
    223  * Define what the possible "soft" errors can be.  These are non-fatal returns
    224  * of various network related functions, like recv() and so on.
    225  *
    226  * For some reason, BSDI (and perhaps others) will sometimes return <0
    227  * from recv() but will have errno==0.  This is broken, but we have to
    228  * work around it here.
    229  */
    230 #define SOFT_ERROR(e)                                             \
    231 	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
    232 	 (e) == EINTR || (e) == 0)
    233 
    234 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
    235 
    236 /*!<
    237  * DLVL(90)  --  Function entry/exit and other tracing.
    238  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
    239  * DLVL(60)  --  Socket data send/receive
    240  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
    241  * DLVL(20)  --  Socket creation/destruction.
    242  */
    243 #define TRACE_LEVEL	  90
    244 #define CORRECTNESS_LEVEL 70
    245 #define IOEVENT_LEVEL	  60
    246 #define EVENT_LEVEL	  50
    247 #define CREATION_LEVEL	  20
    248 
    249 #define TRACE	    DLVL(TRACE_LEVEL)
    250 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
    251 #define IOEVENT	    DLVL(IOEVENT_LEVEL)
    252 #define EVENT	    DLVL(EVENT_LEVEL)
    253 #define CREATION    DLVL(CREATION_LEVEL)
    254 
    255 typedef isc_event_t intev_t;
    256 
    257 #define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
    258 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
    259 
    260 /*!
    261  * IPv6 control information.  If the socket is an IPv6 socket we want
    262  * to collect the destination address and interface so the client can
    263  * set them on outgoing packets.
    264  */
    265 #ifndef USE_CMSG
    266 #define USE_CMSG 1
    267 #endif /* ifndef USE_CMSG */
    268 
    269 /*%
    270  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
    271  * a setsockopt() like interface to request timestamps, and if the OS
    272  * doesn't do it for us, call gettimeofday() on every UDP receive?
    273  */
    274 #ifdef SO_TIMESTAMP
    275 #ifndef USE_CMSG
    276 #define USE_CMSG 1
    277 #endif /* ifndef USE_CMSG */
    278 #endif /* ifdef SO_TIMESTAMP */
    279 
    280 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
    281 #define SET_RCVBUF
    282 #endif
    283 
    284 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
    285 #define SET_SNDBUF
    286 #endif
    287 
    288 /*%
    289  * Instead of calculating the cmsgbuf lengths every time we take
    290  * a rule of thumb approach - sizes are taken from x86_64 linux,
    291  * multiplied by 2, everything should fit. Those sizes are not
    292  * large enough to cause any concern.
    293  */
    294 #if defined(USE_CMSG)
    295 #define CMSG_SP_IN6PKT 40
    296 #else /* if defined(USE_CMSG) */
    297 #define CMSG_SP_IN6PKT 0
    298 #endif /* if defined(USE_CMSG) */
    299 
    300 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
    301 #define CMSG_SP_TIMESTAMP 32
    302 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
    303 #define CMSG_SP_TIMESTAMP 0
    304 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
    305 
    306 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
    307 #define CMSG_SP_TCTOS 24
    308 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
    309 #define CMSG_SP_TCTOS 0
    310 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
    311 
    312 #define CMSG_SP_INT 24
    313 
    314 /* Align cmsg buffers to be safe on SPARC etc. */
    315 #define RECVCMSGBUFLEN                                                       \
    316 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
    317 			  1,                                                 \
    318 		  sizeof(void *))
    319 #define SENDCMSGBUFLEN                                                    \
    320 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
    321 		  sizeof(void *))
    322 
    323 /*%
    324  * The number of times a send operation is repeated if the result is EINTR.
    325  */
    326 #define NRETRIES 10
    327 
    328 typedef struct isc__socketthread isc__socketthread_t;
    329 
    330 #define NEWCONNSOCK(ev) ((ev)->newsocket)
    331 
    332 struct isc_socket {
    333 	/* Not locked. */
    334 	unsigned int magic;
    335 	isc_socketmgr_t *manager;
    336 	isc_mutex_t lock;
    337 	isc_sockettype_t type;
    338 	const isc_statscounter_t *statsindex;
    339 	isc_refcount_t references;
    340 
    341 	/* Locked by socket lock. */
    342 	ISC_LINK(isc_socket_t) link;
    343 	int fd;
    344 	int pf;
    345 	int threadid;
    346 	char name[16];
    347 	void *tag;
    348 
    349 	ISC_LIST(isc_socketevent_t) send_list;
    350 	ISC_LIST(isc_socketevent_t) recv_list;
    351 	ISC_LIST(isc_socket_newconnev_t) accept_list;
    352 	ISC_LIST(isc_socket_connev_t) connect_list;
    353 
    354 	isc_sockaddr_t peer_address; /* remote address */
    355 
    356 	unsigned int listener : 1,	       /* listener socket */
    357 		connected : 1, connecting : 1, /* connect pending
    358 						* */
    359 		bound  : 1,		       /* bound to local addr */
    360 		dupped : 1, active : 1,	       /* currently active */
    361 		pktdscp : 1;		       /* per packet dscp */
    362 
    363 #ifdef ISC_PLATFORM_RECVOVERFLOW
    364 	unsigned char overflow; /* used for MSG_TRUNC fake */
    365 #endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */
    366 
    367 	void			*fdwatcharg;
    368 	isc_sockfdwatch_t	fdwatchcb;
    369 	int			fdwatchflags;
    370 	isc_task_t              *fdwatchtask;
    371 	unsigned int		dscp;
    372 };
    373 
    374 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
    375 #define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
    376 
    377 struct isc_socketmgr {
    378 	/* Not locked. */
    379 	unsigned int magic;
    380 	isc_mem_t *mctx;
    381 	isc_mutex_t lock;
    382 	isc_stats_t *stats;
    383 	int nthreads;
    384 	isc__socketthread_t *threads;
    385 	unsigned int maxsocks;
    386 	/* Locked by manager lock. */
    387 	ISC_LIST(isc_socket_t) socklist;
    388 	int reserved; /* unlocked */
    389 	isc_condition_t shutdown_ok;
    390 	size_t maxudp;
    391 };
    392 
    393 struct isc__socketthread {
    394 	isc_socketmgr_t *manager;
    395 	int threadid;
    396 	isc_thread_t thread;
    397 	int pipe_fds[2];
    398 	isc_mutex_t *fdlock;
    399 	/* Locked by fdlock. */
    400 	isc_socket_t **fds;
    401 	int *fdstate;
    402 #ifdef USE_KQUEUE
    403 	int kqueue_fd;
    404 	int nevents;
    405 	struct kevent *events;
    406 #endif /* USE_KQUEUE */
    407 #ifdef USE_EPOLL
    408 	int epoll_fd;
    409 	int nevents;
    410 	struct epoll_event *events;
    411 	uint32_t *epoll_events;
    412 #endif /* USE_EPOLL */
    413 #ifdef USE_DEVPOLL
    414 	int devpoll_fd;
    415 	isc_resourcevalue_t open_max;
    416 	unsigned int calls;
    417 	int nevents;
    418 	struct pollfd *events;
    419 	pollinfo_t *fdpollinfo;
    420 #endif /* USE_DEVPOLL */
    421 #ifdef USE_SELECT
    422 	int fd_bufsize;
    423 	fd_set *read_fds;
    424 	fd_set *read_fds_copy;
    425 	fd_set *write_fds;
    426 	fd_set *write_fds_copy;
    427 	int maxfd;
    428 #endif /* USE_SELECT */
    429 };
    430 
    431 #define CLOSED	      0 /* this one must be zero */
    432 #define MANAGED	      1
    433 #define CLOSE_PENDING 2
    434 
    435 /*
    436  * send() and recv() iovec counts
    437  */
    438 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
    439 #ifdef ISC_PLATFORM_RECVOVERFLOW
    440 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
    441 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
    442 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
    443 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
    444 
    445 static isc_result_t
    446 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
    447 	      isc_socket_t **socketp, isc_socket_t *dup_socket);
    448 static void
    449 send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
    450 static void
    451 send_senddone_event(isc_socket_t *, isc_socketevent_t **);
    452 static void
    453 send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
    454 static void
    455 free_socket(isc_socket_t **);
    456 static isc_result_t
    457 allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
    458 static void
    459 destroy(isc_socket_t **);
    460 static void
    461 internal_accept(isc_socket_t *);
    462 static void
    463 internal_connect(isc_socket_t *);
    464 static void
    465 internal_recv(isc_socket_t *);
    466 static void
    467 internal_send(isc_socket_t *);
    468 static void
    469 process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
    470 static void
    471 build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
    472 		  struct iovec *, size_t *);
    473 static void
    474 build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
    475 		  struct iovec *, size_t *);
    476 static bool
    477 process_ctlfd(isc__socketthread_t *thread);
    478 static void
    479 setdscp(isc_socket_t *sock, isc_dscp_t dscp);
    480 static void
    481 dispatch_recv(isc_socket_t *sock);
    482 static void
    483 dispatch_send(isc_socket_t *sock);
    484 static void
    485 internal_fdwatch_read(isc_socket_t *sock);
    486 static void
    487 internal_fdwatch_write(isc_socket_t *sock);
    488 
    489 #define SELECT_POKE_SHUTDOWN (-1)
    490 #define SELECT_POKE_NOTHING  (-2)
    491 #define SELECT_POKE_READ     (-3)
    492 #define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
    493 #define SELECT_POKE_WRITE    (-4)
    494 #define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
    495 #define SELECT_POKE_CLOSE    (-5)
    496 
    497 /*%
    498  * Shortcut index arrays to get access to statistics counters.
    499  */
/*
 * Positions within the per-protocol statistics index tables.  Only the
 * first enumerator is given a value; the rest follow consecutively, so
 * STATID_ACTIVE ends up as 10.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL,    /* 1 */
	STATID_CLOSE,	    /* 2 */
	STATID_BINDFAIL,    /* 3 */
	STATID_CONNECTFAIL, /* 4 */
	STATID_CONNECT,	    /* 5 */
	STATID_ACCEPTFAIL,  /* 6 */
	STATID_ACCEPT,	    /* 7 */
	STATID_SENDFAIL,    /* 8 */
	STATID_RECVFAIL,    /* 9 */
	STATID_ACTIVE	    /* 10 */
};
    513 static const isc_statscounter_t udp4statsindex[] = {
    514 	isc_sockstatscounter_udp4open,
    515 	isc_sockstatscounter_udp4openfail,
    516 	isc_sockstatscounter_udp4close,
    517 	isc_sockstatscounter_udp4bindfail,
    518 	isc_sockstatscounter_udp4connectfail,
    519 	isc_sockstatscounter_udp4connect,
    520 	-1,
    521 	-1,
    522 	isc_sockstatscounter_udp4sendfail,
    523 	isc_sockstatscounter_udp4recvfail,
    524 	isc_sockstatscounter_udp4active
    525 };
    526 static const isc_statscounter_t udp6statsindex[] = {
    527 	isc_sockstatscounter_udp6open,
    528 	isc_sockstatscounter_udp6openfail,
    529 	isc_sockstatscounter_udp6close,
    530 	isc_sockstatscounter_udp6bindfail,
    531 	isc_sockstatscounter_udp6connectfail,
    532 	isc_sockstatscounter_udp6connect,
    533 	-1,
    534 	-1,
    535 	isc_sockstatscounter_udp6sendfail,
    536 	isc_sockstatscounter_udp6recvfail,
    537 	isc_sockstatscounter_udp6active
    538 };
    539 static const isc_statscounter_t tcp4statsindex[] = {
    540 	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
    541 	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
    542 	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
    543 	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
    544 	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
    545 	isc_sockstatscounter_tcp4active
    546 };
    547 static const isc_statscounter_t tcp6statsindex[] = {
    548 	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
    549 	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
    550 	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
    551 	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
    552 	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
    553 	isc_sockstatscounter_tcp6active
    554 };
    555 static const isc_statscounter_t unixstatsindex[] = {
    556 	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
    557 	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
    558 	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
    559 	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
    560 	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
    561 	isc_sockstatscounter_unixactive
    562 };
    563 static const isc_statscounter_t rawstatsindex[] = {
    564 	isc_sockstatscounter_rawopen,
    565 	isc_sockstatscounter_rawopenfail,
    566 	isc_sockstatscounter_rawclose,
    567 	-1,
    568 	-1,
    569 	-1,
    570 	-1,
    571 	-1,
    572 	-1,
    573 	isc_sockstatscounter_rawrecvfail,
    574 	isc_sockstatscounter_rawactive
    575 };
    576 
    577 static int
    578 gen_threadid(isc_socket_t *sock);
    579 
    580 static int
    581 gen_threadid(isc_socket_t *sock) {
    582 	return (sock->fd % sock->manager->nthreads);
    583 }
    584 
    585 static void
    586 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
    587 	    isc_logmodule_t *module, int level, const char *fmt, ...)
    588 	ISC_FORMAT_PRINTF(5, 6);
    589 static void
    590 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
    591 	    isc_logmodule_t *module, int level, const char *fmt, ...) {
    592 	char msgbuf[2048];
    593 	va_list ap;
    594 
    595 	if (!isc_log_wouldlog(isc_lctx, level)) {
    596 		return;
    597 	}
    598 
    599 	va_start(ap, fmt);
    600 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
    601 	va_end(ap);
    602 
    603 	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
    604 		      sockmgr, msgbuf);
    605 }
    606 
    607 static void
    608 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
    609 	   isc_logmodule_t *module, int level, const char *fmt, ...)
    610 	ISC_FORMAT_PRINTF(5, 6);
    611 static void
    612 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
    613 	   isc_logmodule_t *module, int level, const char *fmt, ...) {
    614 	char msgbuf[2048];
    615 	va_list ap;
    616 
    617 	if (!isc_log_wouldlog(isc_lctx, level)) {
    618 		return;
    619 	}
    620 
    621 	va_start(ap, fmt);
    622 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
    623 	va_end(ap);
    624 
    625 	isc_log_write(isc_lctx, category, module, level,
    626 		      "sockmgr %p thread %d: %s", thread->manager,
    627 		      thread->threadid, msgbuf);
    628 }
    629 
    630 static void
    631 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
    632 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
    633 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
    634 static void
    635 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
    636 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
    637 	   const char *fmt, ...) {
    638 	char msgbuf[2048];
    639 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
    640 	va_list ap;
    641 
    642 	if (!isc_log_wouldlog(isc_lctx, level)) {
    643 		return;
    644 	}
    645 
    646 	va_start(ap, fmt);
    647 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
    648 	va_end(ap);
    649 
    650 	if (address == NULL) {
    651 		isc_log_write(isc_lctx, category, module, level,
    652 			      "socket %p: %s", sock, msgbuf);
    653 	} else {
    654 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
    655 		isc_log_write(isc_lctx, category, module, level,
    656 			      "socket %p %s: %s", sock, peerbuf, msgbuf);
    657 	}
    658 }
    659 
    660 /*%
    661  * Increment socket-related statistics counters.
    662  */
    663 static void
    664 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
    665 	REQUIRE(counterid != -1);
    666 
    667 	if (stats != NULL) {
    668 		isc_stats_increment(stats, counterid);
    669 	}
    670 }
    671 
    672 /*%
    673  * Decrement socket-related statistics counters.
    674  */
    675 static void
    676 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
    677 	REQUIRE(counterid != -1);
    678 
    679 	if (stats != NULL) {
    680 		isc_stats_decrement(stats, counterid);
    681 	}
    682 }
    683 
    684 static isc_result_t
    685 watch_fd(isc__socketthread_t *thread, int fd, int msg) {
    686 	isc_result_t result = ISC_R_SUCCESS;
    687 
    688 #ifdef USE_KQUEUE
    689 	struct kevent evchange;
    690 
    691 	memset(&evchange, 0, sizeof(evchange));
    692 	if (msg == SELECT_POKE_READ) {
    693 		evchange.filter = EVFILT_READ;
    694 	} else {
    695 		evchange.filter = EVFILT_WRITE;
    696 	}
    697 	evchange.flags = EV_ADD;
    698 	evchange.ident = fd;
    699 	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
    700 		result = isc__errno2result(errno);
    701 	}
    702 
    703 	return (result);
    704 #elif defined(USE_EPOLL)
    705 	struct epoll_event event;
    706 	uint32_t oldevents;
    707 	int ret;
    708 	int op;
    709 
    710 	oldevents = thread->epoll_events[fd];
    711 	if (msg == SELECT_POKE_READ) {
    712 		thread->epoll_events[fd] |= EPOLLIN;
    713 	} else {
    714 		thread->epoll_events[fd] |= EPOLLOUT;
    715 	}
    716 
    717 	event.events = thread->epoll_events[fd];
    718 	memset(&event.data, 0, sizeof(event.data));
    719 	event.data.fd = fd;
    720 
    721 	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
    722 	if (thread->fds[fd] != NULL) {
    723 		LOCK(&thread->fds[fd]->lock);
    724 	}
    725 	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
    726 	if (thread->fds[fd] != NULL) {
    727 		UNLOCK(&thread->fds[fd]->lock);
    728 	}
    729 	if (ret == -1) {
    730 		if (errno == EEXIST) {
    731 			UNEXPECTED_ERROR(__FILE__, __LINE__,
    732 					 "epoll_ctl(ADD/MOD) returned "
    733 					 "EEXIST for fd %d",
    734 					 fd);
    735 		}
    736 		result = isc__errno2result(errno);
    737 	}
    738 
    739 	return (result);
    740 #elif defined(USE_DEVPOLL)
    741 	struct pollfd pfd;
    742 
    743 	memset(&pfd, 0, sizeof(pfd));
    744 	if (msg == SELECT_POKE_READ) {
    745 		pfd.events = POLLIN;
    746 	} else {
    747 		pfd.events = POLLOUT;
    748 	}
    749 	pfd.fd = fd;
    750 	pfd.revents = 0;
    751 	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
    752 		result = isc__errno2result(errno);
    753 	} else {
    754 		if (msg == SELECT_POKE_READ) {
    755 			thread->fdpollinfo[fd].want_read = 1;
    756 		} else {
    757 			thread->fdpollinfo[fd].want_write = 1;
    758 		}
    759 	}
    760 
    761 	return (result);
    762 #elif defined(USE_SELECT)
    763 	LOCK(&thread->manager->lock);
    764 	if (msg == SELECT_POKE_READ) {
    765 		FD_SET(fd, thread->read_fds);
    766 	}
    767 	if (msg == SELECT_POKE_WRITE) {
    768 		FD_SET(fd, thread->write_fds);
    769 	}
    770 	UNLOCK(&thread->manager->lock);
    771 
    772 	return (result);
    773 #endif /* ifdef USE_KQUEUE */
    774 }
    775 
    776 static isc_result_t
    777 unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
    778 	isc_result_t result = ISC_R_SUCCESS;
    779 
    780 #ifdef USE_KQUEUE
    781 	struct kevent evchange;
    782 
    783 	memset(&evchange, 0, sizeof(evchange));
    784 	if (msg == SELECT_POKE_READ) {
    785 		evchange.filter = EVFILT_READ;
    786 	} else {
    787 		evchange.filter = EVFILT_WRITE;
    788 	}
    789 	evchange.flags = EV_DELETE;
    790 	evchange.ident = fd;
    791 	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
    792 		result = isc__errno2result(errno);
    793 	}
    794 
    795 	return (result);
    796 #elif defined(USE_EPOLL)
    797 	struct epoll_event event;
    798 	int ret;
    799 	int op;
    800 
    801 	if (msg == SELECT_POKE_READ) {
    802 		thread->epoll_events[fd] &= ~(EPOLLIN);
    803 	} else {
    804 		thread->epoll_events[fd] &= ~(EPOLLOUT);
    805 	}
    806 
    807 	event.events = thread->epoll_events[fd];
    808 	memset(&event.data, 0, sizeof(event.data));
    809 	event.data.fd = fd;
    810 
    811 	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
    812 	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
    813 	if (ret == -1 && errno != ENOENT) {
    814 		char strbuf[ISC_STRERRORSIZE];
    815 		strerror_r(errno, strbuf, sizeof(strbuf));
    816 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
    817 				 fd, strbuf);
    818 		result = ISC_R_UNEXPECTED;
    819 	}
    820 	return (result);
    821 #elif defined(USE_DEVPOLL)
    822 	struct pollfd pfds[2];
    823 	size_t writelen = sizeof(pfds[0]);
    824 
    825 	memset(pfds, 0, sizeof(pfds));
    826 	pfds[0].events = POLLREMOVE;
    827 	pfds[0].fd = fd;
    828 
    829 	/*
    830 	 * Canceling read or write polling via /dev/poll is tricky.  Since it
    831 	 * only provides a way of canceling per FD, we may need to re-poll the
    832 	 * socket for the other operation.
    833 	 */
    834 	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
    835 		pfds[1].events = POLLOUT;
    836 		pfds[1].fd = fd;
    837 		writelen += sizeof(pfds[1]);
    838 	}
    839 	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
    840 		pfds[1].events = POLLIN;
    841 		pfds[1].fd = fd;
    842 		writelen += sizeof(pfds[1]);
    843 	}
    844 
    845 	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
    846 		result = isc__errno2result(errno);
    847 	} else {
    848 		if (msg == SELECT_POKE_READ) {
    849 			thread->fdpollinfo[fd].want_read = 0;
    850 		} else {
    851 			thread->fdpollinfo[fd].want_write = 0;
    852 		}
    853 	}
    854 
    855 	return (result);
    856 #elif defined(USE_SELECT)
    857 	LOCK(&thread->manager->lock);
    858 	if (msg == SELECT_POKE_READ) {
    859 		FD_CLR(fd, thread->read_fds);
    860 	} else if (msg == SELECT_POKE_WRITE) {
    861 		FD_CLR(fd, thread->write_fds);
    862 	}
    863 	UNLOCK(&thread->manager->lock);
    864 
    865 	return (result);
    866 #endif /* ifdef USE_KQUEUE */
    867 }
    868 
    869 /*
    870  * A poke message was received, perform a proper watch/unwatch
    871  * on a fd provided
    872  */
    873 static void
    874 wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
    875 	isc_result_t result;
    876 	int lockid = FDLOCK_ID(fd);
    877 
    878 	/*
    879 	 * This is a wakeup on a socket.  If the socket is not in the
    880 	 * process of being closed, start watching it for either reads
    881 	 * or writes.
    882 	 */
    883 
    884 	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
    885 
    886 	if (msg == SELECT_POKE_CLOSE) {
    887 		LOCK(&thread->fdlock[lockid]);
    888 		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
    889 		thread->fdstate[fd] = CLOSED;
    890 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
    891 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
    892 		(void)close(fd);
    893 		UNLOCK(&thread->fdlock[lockid]);
    894 		return;
    895 	}
    896 
    897 	LOCK(&thread->fdlock[lockid]);
    898 	if (thread->fdstate[fd] == CLOSE_PENDING) {
    899 		/*
    900 		 * We accept (and ignore) any error from unwatch_fd() as we are
    901 		 * closing the socket, hoping it doesn't leave dangling state in
    902 		 * the kernel.
    903 		 * Note that unwatch_fd() must be called after releasing the
    904 		 * fdlock; otherwise it could cause deadlock due to a lock order
    905 		 * reversal.
    906 		 */
    907 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
    908 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
    909 		UNLOCK(&thread->fdlock[lockid]);
    910 		return;
    911 	}
    912 	if (thread->fdstate[fd] != MANAGED) {
    913 		UNLOCK(&thread->fdlock[lockid]);
    914 		return;
    915 	}
    916 
    917 	/*
    918 	 * Set requested bit.
    919 	 */
    920 	result = watch_fd(thread, fd, msg);
    921 	if (result != ISC_R_SUCCESS) {
    922 		/*
    923 		 * XXXJT: what should we do?  Ignoring the failure of watching
    924 		 * a socket will make the application dysfunctional, but there
    925 		 * seems to be no reasonable recovery process.
    926 		 */
    927 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
    928 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
    929 			      "failed to start watching FD (%d): %s", fd,
    930 			      isc_result_totext(result));
    931 	}
    932 	UNLOCK(&thread->fdlock[lockid]);
    933 }
    934 
    935 /*
    936  * Poke the select loop when there is something for us to do.
    937  * The write is required (by POSIX) to complete.  That is, we
    938  * will not get partial writes.
    939  */
    940 static void
    941 select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
    942 	int cc;
    943 	int buf[2];
    944 	char strbuf[ISC_STRERRORSIZE];
    945 
    946 	buf[0] = fd;
    947 	buf[1] = msg;
    948 
    949 	do {
    950 		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
    951 			   sizeof(buf));
    952 #ifdef ENOSR
    953 		/*
    954 		 * Treat ENOSR as EAGAIN but loop slowly as it is
    955 		 * unlikely to clear fast.
    956 		 */
    957 		if (cc < 0 && errno == ENOSR) {
    958 			sleep(1);
    959 			errno = EAGAIN;
    960 		}
    961 #endif /* ifdef ENOSR */
    962 	} while (cc < 0 && SOFT_ERROR(errno));
    963 
    964 	if (cc < 0) {
    965 		strerror_r(errno, strbuf, sizeof(strbuf));
    966 		FATAL_ERROR(__FILE__, __LINE__,
    967 			    "write() failed during watcher poke: %s", strbuf);
    968 	}
    969 
    970 	INSIST(cc == sizeof(buf));
    971 }
    972 
    973 /*
    974  * Read a message on the internal fd.
    975  */
    976 static void
    977 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
    978 	int buf[2];
    979 	int cc;
    980 	char strbuf[ISC_STRERRORSIZE];
    981 
    982 	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
    983 	if (cc < 0) {
    984 		*msg = SELECT_POKE_NOTHING;
    985 		*fd = -1; /* Silence compiler. */
    986 		if (SOFT_ERROR(errno)) {
    987 			return;
    988 		}
    989 
    990 		strerror_r(errno, strbuf, sizeof(strbuf));
    991 		FATAL_ERROR(__FILE__, __LINE__,
    992 			    "read() failed during watcher poke: %s", strbuf);
    993 	}
    994 	INSIST(cc == sizeof(buf));
    995 
    996 	*fd = buf[0];
    997 	*msg = buf[1];
    998 }
    999 
   1000 /*
   1001  * Make a fd non-blocking.
   1002  */
   1003 static isc_result_t
   1004 make_nonblock(int fd) {
   1005 	int ret;
   1006 	char strbuf[ISC_STRERRORSIZE];
   1007 #ifdef USE_FIONBIO_IOCTL
   1008 	int on = 1;
   1009 #else  /* ifdef USE_FIONBIO_IOCTL */
   1010 	int flags;
   1011 #endif /* ifdef USE_FIONBIO_IOCTL */
   1012 
   1013 #ifdef USE_FIONBIO_IOCTL
   1014 	ret = ioctl(fd, FIONBIO, (char *)&on);
   1015 #else  /* ifdef USE_FIONBIO_IOCTL */
   1016 	flags = fcntl(fd, F_GETFL, 0);
   1017 	flags |= PORT_NONBLOCK;
   1018 	ret = fcntl(fd, F_SETFL, flags);
   1019 #endif /* ifdef USE_FIONBIO_IOCTL */
   1020 
   1021 	if (ret == -1) {
   1022 		strerror_r(errno, strbuf, sizeof(strbuf));
   1023 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   1024 #ifdef USE_FIONBIO_IOCTL
   1025 				 "ioctl(%d, FIONBIO, &on): %s", fd,
   1026 #else  /* ifdef USE_FIONBIO_IOCTL */
   1027 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
   1028 #endif /* ifdef USE_FIONBIO_IOCTL */
   1029 				 strbuf);
   1030 
   1031 		return (ISC_R_UNEXPECTED);
   1032 	}
   1033 
   1034 	return (ISC_R_SUCCESS);
   1035 }
   1036 
   1037 #ifdef USE_CMSG
   1038 /*
   1039  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
   1040  * In order to ensure as much portability as possible, we provide wrapper
   1041  * functions of these macros.
   1042  * Note that cmsg_space() could run slow on OSes that do not have
   1043  * CMSG_SPACE.
   1044  */
/*
 * Portable wrapper for CMSG_LEN: the value to store in cmsg_len for a
 * control message carrying 'len' bytes of data.
 */
static socklen_t
cmsg_len(socklen_t len) {
#ifndef CMSG_LEN
	/*
	 * No CMSG_LEN on this platform: derive the header length from the
	 * data offset CMSG_DATA yields for a NULL header.  NULL is cast so
	 * that any pointer arithmetic performed by CMSG_DATA is correct.
	 */
	socklen_t header_bytes =
		(socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));

	return (header_bytes + len);
#else  /* ifndef CMSG_LEN */
	return (CMSG_LEN(len));
#endif /* ifndef CMSG_LEN */
}
   1060 
   1061 static socklen_t
   1062 cmsg_space(socklen_t len) {
   1063 #ifdef CMSG_SPACE
   1064 	return (CMSG_SPACE(len));
   1065 #else  /* ifdef CMSG_SPACE */
   1066 	struct msghdr msg;
   1067 	struct cmsghdr *cmsgp;
   1068 	/*
   1069 	 * XXX: The buffer length is an ad-hoc value, but should be enough
   1070 	 * in a practical sense.
   1071 	 */
   1072 	char dummybuf[sizeof(struct cmsghdr) + 1024];
   1073 
   1074 	memset(&msg, 0, sizeof(msg));
   1075 	msg.msg_control = dummybuf;
   1076 	msg.msg_controllen = sizeof(dummybuf);
   1077 
   1078 	cmsgp = (struct cmsghdr *)dummybuf;
   1079 	cmsgp->cmsg_len = cmsg_len(len);
   1080 
   1081 	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
   1082 	if (cmsgp != NULL) {
   1083 		return ((char *)cmsgp - (char *)msg.msg_control);
   1084 	} else {
   1085 		return (0);
   1086 	}
   1087 #endif /* ifdef CMSG_SPACE */
   1088 }
   1089 #endif /* USE_CMSG */
   1090 
   1091 /*
   1092  * Process control messages received on a socket.
   1093  */
   1094 static void
   1095 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
   1096 #ifdef USE_CMSG
   1097 	struct cmsghdr *cmsgp;
   1098 	struct in6_pktinfo *pktinfop;
   1099 #ifdef SO_TIMESTAMP
   1100 	void *timevalp;
   1101 #endif /* ifdef SO_TIMESTAMP */
   1102 #endif /* ifdef USE_CMSG */
   1103 
   1104 	/*
   1105 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
   1106 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
   1107 	 * They are all here, outside of the CPP tests, because it is
   1108 	 * more consistent with the usual ISC coding style.
   1109 	 */
   1110 	UNUSED(sock);
   1111 	UNUSED(msg);
   1112 	UNUSED(dev);
   1113 
   1114 #ifdef MSG_TRUNC
   1115 	if ((msg->msg_flags & MSG_TRUNC) != 0) {
   1116 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
   1117 	}
   1118 #endif /* ifdef MSG_TRUNC */
   1119 
   1120 #ifdef MSG_CTRUNC
   1121 	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
   1122 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
   1123 	}
   1124 #endif /* ifdef MSG_CTRUNC */
   1125 
   1126 #ifndef USE_CMSG
   1127 	return;
   1128 #else /* ifndef USE_CMSG */
   1129 	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
   1130 		return;
   1131 	}
   1132 
   1133 #ifdef SO_TIMESTAMP
   1134 	timevalp = NULL;
   1135 #endif /* ifdef SO_TIMESTAMP */
   1136 	pktinfop = NULL;
   1137 
   1138 	cmsgp = CMSG_FIRSTHDR(msg);
   1139 	while (cmsgp != NULL) {
   1140 		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
   1141 
   1142 		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
   1143 		    cmsgp->cmsg_type == IPV6_PKTINFO)
   1144 		{
   1145 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
   1146 			memmove(&dev->pktinfo, pktinfop,
   1147 				sizeof(struct in6_pktinfo));
   1148 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
   1149 			socket_log(sock, NULL, TRACE,
   1150 				   "interface received on ifindex %u",
   1151 				   dev->pktinfo.ipi6_ifindex);
   1152 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
   1153 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
   1154 			}
   1155 			goto next;
   1156 		}
   1157 
   1158 #ifdef SO_TIMESTAMP
   1159 		if (cmsgp->cmsg_level == SOL_SOCKET &&
   1160 		    cmsgp->cmsg_type == SCM_TIMESTAMP)
   1161 		{
   1162 			struct timeval tv;
   1163 			timevalp = CMSG_DATA(cmsgp);
   1164 			memmove(&tv, timevalp, sizeof(tv));
   1165 			dev->timestamp.seconds = tv.tv_sec;
   1166 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
   1167 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
   1168 			goto next;
   1169 		}
   1170 #endif /* ifdef SO_TIMESTAMP */
   1171 
   1172 #ifdef IPV6_TCLASS
   1173 		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
   1174 		    cmsgp->cmsg_type == IPV6_TCLASS)
   1175 		{
   1176 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
   1177 			dev->dscp >>= 2;
   1178 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
   1179 			goto next;
   1180 		}
   1181 #endif /* ifdef IPV6_TCLASS */
   1182 
   1183 #ifdef IP_TOS
   1184 		if (cmsgp->cmsg_level == IPPROTO_IP &&
   1185 		    (cmsgp->cmsg_type == IP_TOS
   1186 #ifdef IP_RECVTOS
   1187 		     || cmsgp->cmsg_type == IP_RECVTOS
   1188 #endif /* ifdef IP_RECVTOS */
   1189 		     ))
   1190 		{
   1191 			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
   1192 			dev->dscp >>= 2;
   1193 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
   1194 			goto next;
   1195 		}
   1196 #endif /* ifdef IP_TOS */
   1197 	next:
   1198 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
   1199 	}
   1200 #endif /* USE_CMSG */
   1201 }
   1202 
   1203 /*
   1204  * Construct an iov array and attach it to the msghdr passed in.  This is
   1205  * the SEND constructor, which will use the used region of the buffer
   1206  * (if using a buffer list) or will use the internal region (if a single
   1207  * buffer I/O is requested).
   1208  *
   1209  * Nothing can be NULL, and the done event must list at least one buffer
   1210  * on the buffer linked list for this function to be meaningful.
   1211  *
   1212  * If write_countp != NULL, *write_countp will hold the number of bytes
   1213  * this transaction can send.
   1214  */
   1215 static void
   1216 build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
   1217 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
   1218 	unsigned int iovcount;
   1219 	size_t write_count;
   1220 	struct cmsghdr *cmsgp;
   1221 
   1222 	memset(msg, 0, sizeof(*msg));
   1223 
   1224 	if (!sock->connected) {
   1225 		msg->msg_name = (void *)&dev->address.type.sa;
   1226 		msg->msg_namelen = dev->address.length;
   1227 	} else {
   1228 		msg->msg_name = NULL;
   1229 		msg->msg_namelen = 0;
   1230 	}
   1231 
   1232 	write_count = dev->region.length - dev->n;
   1233 	iov[0].iov_base = (void *)(dev->region.base + dev->n);
   1234 	iov[0].iov_len = write_count;
   1235 	iovcount = 1;
   1236 
   1237 	msg->msg_iov = iov;
   1238 	msg->msg_iovlen = iovcount;
   1239 	msg->msg_control = NULL;
   1240 	msg->msg_controllen = 0;
   1241 	msg->msg_flags = 0;
   1242 #if defined(USE_CMSG)
   1243 
   1244 	if ((sock->type == isc_sockettype_udp) &&
   1245 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
   1246 	{
   1247 		struct in6_pktinfo *pktinfop;
   1248 
   1249 		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
   1250 			   dev->pktinfo.ipi6_ifindex);
   1251 
   1252 		msg->msg_control = (void *)cmsgbuf;
   1253 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
   1254 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
   1255 
   1256 		cmsgp = (struct cmsghdr *)cmsgbuf;
   1257 		cmsgp->cmsg_level = IPPROTO_IPV6;
   1258 		cmsgp->cmsg_type = IPV6_PKTINFO;
   1259 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
   1260 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
   1261 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
   1262 	}
   1263 
   1264 #if defined(IPV6_USE_MIN_MTU)
   1265 	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
   1266 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
   1267 	{
   1268 		int use_min_mtu = 1; /* -1, 0, 1 */
   1269 
   1270 		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
   1271 		msg->msg_control = (void *)cmsgbuf;
   1272 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
   1273 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
   1274 
   1275 		cmsgp->cmsg_level = IPPROTO_IPV6;
   1276 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
   1277 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
   1278 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
   1279 	}
   1280 #endif /* if defined(IPV6_USE_MIN_MTU) */
   1281 
   1282 	if (isc_dscp_check_value > -1) {
   1283 		if (sock->type == isc_sockettype_udp) {
   1284 			INSIST((int)dev->dscp == isc_dscp_check_value);
   1285 		} else if (sock->type == isc_sockettype_tcp) {
   1286 			INSIST((int)sock->dscp == isc_dscp_check_value);
   1287 		}
   1288 	}
   1289 
   1290 #if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
   1291 	if ((sock->type == isc_sockettype_udp) &&
   1292 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
   1293 	{
   1294 		int dscp = (dev->dscp << 2) & 0xff;
   1295 
   1296 		INSIST(dev->dscp < 0x40);
   1297 
   1298 #ifdef IP_TOS
   1299 		if (sock->pf == AF_INET && sock->pktdscp) {
   1300 			cmsgp = (struct cmsghdr *)(cmsgbuf +
   1301 						   msg->msg_controllen);
   1302 			msg->msg_control = (void *)cmsgbuf;
   1303 			msg->msg_controllen += cmsg_space(sizeof(dscp));
   1304 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
   1305 
   1306 			cmsgp->cmsg_level = IPPROTO_IP;
   1307 			cmsgp->cmsg_type = IP_TOS;
   1308 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
   1309 			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
   1310 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
   1311 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
   1312 				       (void *)&dscp, sizeof(int)) < 0)
   1313 			{
   1314 				char strbuf[ISC_STRERRORSIZE];
   1315 				strerror_r(errno, strbuf, sizeof(strbuf));
   1316 				UNEXPECTED_ERROR(__FILE__, __LINE__,
   1317 						 "setsockopt(%d, IP_TOS, %.02x)"
   1318 						 " failed: %s",
   1319 						 sock->fd, dscp >> 2, strbuf);
   1320 			} else {
   1321 				sock->dscp = dscp;
   1322 			}
   1323 		}
   1324 #endif /* ifdef IP_TOS */
   1325 #if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
   1326 		if (sock->pf == AF_INET6 && sock->pktdscp) {
   1327 			cmsgp = (struct cmsghdr *)(cmsgbuf +
   1328 						   msg->msg_controllen);
   1329 			msg->msg_control = (void *)cmsgbuf;
   1330 			msg->msg_controllen += cmsg_space(sizeof(dscp));
   1331 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
   1332 
   1333 			cmsgp->cmsg_level = IPPROTO_IPV6;
   1334 			cmsgp->cmsg_type = IPV6_TCLASS;
   1335 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
   1336 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
   1337 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
   1338 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
   1339 				       (void *)&dscp, sizeof(int)) < 0)
   1340 			{
   1341 				char strbuf[ISC_STRERRORSIZE];
   1342 				strerror_r(errno, strbuf, sizeof(strbuf));
   1343 				UNEXPECTED_ERROR(__FILE__, __LINE__,
   1344 						 "setsockopt(%d, IPV6_TCLASS, "
   1345 						 "%.02x) failed: %s",
   1346 						 sock->fd, dscp >> 2, strbuf);
   1347 			} else {
   1348 				sock->dscp = dscp;
   1349 			}
   1350 		}
   1351 #endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
   1352 		if (msg->msg_controllen != 0 &&
   1353 		    msg->msg_controllen < SENDCMSGBUFLEN)
   1354 		{
   1355 			memset(cmsgbuf + msg->msg_controllen, 0,
   1356 			       SENDCMSGBUFLEN - msg->msg_controllen);
   1357 		}
   1358 	}
   1359 #endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
   1360 	* defined(IPV6_TCLASS))                           \
   1361 	* */
   1362 #endif /* USE_CMSG */
   1363 
   1364 	if (write_countp != NULL) {
   1365 		*write_countp = write_count;
   1366 	}
   1367 }
   1368 
   1369 /*
   1370  * Construct an iov array and attach it to the msghdr passed in.  This is
   1371  * the RECV constructor, which will use the available region of the buffer
   1372  * (if using a buffer list) or will use the internal region (if a single
   1373  * buffer I/O is requested).
   1374  *
   1375  * Nothing can be NULL, and the done event must list at least one buffer
   1376  * on the buffer linked list for this function to be meaningful.
   1377  *
   1378  * If read_countp != NULL, *read_countp will hold the number of bytes
   1379  * this transaction can receive.
   1380  */
   1381 static void
   1382 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
   1383 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
   1384 	unsigned int iovcount;
   1385 	size_t read_count;
   1386 
   1387 	memset(msg, 0, sizeof(struct msghdr));
   1388 
   1389 	if (sock->type == isc_sockettype_udp) {
   1390 		memset(&dev->address, 0, sizeof(dev->address));
   1391 		msg->msg_name = (void *)&dev->address.type.sa;
   1392 		msg->msg_namelen = sizeof(dev->address.type);
   1393 	} else { /* TCP */
   1394 		msg->msg_name = NULL;
   1395 		msg->msg_namelen = 0;
   1396 		dev->address = sock->peer_address;
   1397 	}
   1398 
   1399 	read_count = dev->region.length - dev->n;
   1400 	iov[0].iov_base = (void *)(dev->region.base + dev->n);
   1401 	iov[0].iov_len = read_count;
   1402 	iovcount = 1;
   1403 
   1404 	/*
   1405 	 * If needed, set up to receive that one extra byte.
   1406 	 */
   1407 #ifdef ISC_PLATFORM_RECVOVERFLOW
   1408 	if (sock->type == isc_sockettype_udp) {
   1409 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
   1410 		iov[iovcount].iov_base = (void *)(&sock->overflow);
   1411 		iov[iovcount].iov_len = 1;
   1412 		iovcount++;
   1413 	}
   1414 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
   1415 
   1416 	msg->msg_iov = iov;
   1417 	msg->msg_iovlen = iovcount;
   1418 
   1419 #if defined(USE_CMSG)
   1420 	msg->msg_control = cmsgbuf;
   1421 	msg->msg_controllen = RECVCMSGBUFLEN;
   1422 #else  /* if defined(USE_CMSG) */
   1423 	msg->msg_control = NULL;
   1424 	msg->msg_controllen = 0;
   1425 #endif /* USE_CMSG */
   1426 	msg->msg_flags = 0;
   1427 
   1428 	if (read_countp != NULL) {
   1429 		*read_countp = read_count;
   1430 	}
   1431 }
   1432 
   1433 static void
   1434 set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
   1435 		isc_socketevent_t *dev) {
   1436 	if (sock->type == isc_sockettype_udp) {
   1437 		if (address != NULL) {
   1438 			dev->address = *address;
   1439 		} else {
   1440 			dev->address = sock->peer_address;
   1441 		}
   1442 	} else if (sock->type == isc_sockettype_tcp) {
   1443 		INSIST(address == NULL);
   1444 		dev->address = sock->peer_address;
   1445 	}
   1446 }
   1447 
   1448 static void
   1449 destroy_socketevent(isc_event_t *event) {
   1450 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
   1451 
   1452 	(ev->destroy)(event);
   1453 }
   1454 
   1455 static isc_socketevent_t *
   1456 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
   1457 		     isc_taskaction_t action, void *arg) {
   1458 	isc_socketevent_t *ev;
   1459 
   1460 	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
   1461 						     action, arg, sizeof(*ev));
   1462 
   1463 	ev->result = ISC_R_UNSET;
   1464 	ISC_LINK_INIT(ev, ev_link);
   1465 	ev->region.base = NULL;
   1466 	ev->n = 0;
   1467 	ev->offset = 0;
   1468 	ev->attributes = 0;
   1469 	ev->destroy = ev->ev_destroy;
   1470 	ev->ev_destroy = destroy_socketevent;
   1471 	ev->dscp = 0;
   1472 
   1473 	return (ev);
   1474 }
   1475 
   1476 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the name, iovec array and control-buffer fields
 * of 'msg' to standard output.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	/* %p requires a pointer to void, so cast the typed pointers. */
	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base, (long)msg->msg_iov[i].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
   1492 #endif /* if defined(ISC_SOCKET_DEBUG) */
   1493 
/*
 * Status codes returned by doio_recv() and doio_send().
 */
#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
#define DOIO_HARD    2 /* i/o error, event sent */
#define DOIO_EOF     3 /* EOF, no event sent */
   1498 
   1499 static int
   1500 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
   1501 	int cc;
   1502 	struct iovec iov[MAXSCATTERGATHER_RECV];
   1503 	size_t read_count;
   1504 	struct msghdr msghdr;
   1505 	int recv_errno;
   1506 	char strbuf[ISC_STRERRORSIZE];
   1507 	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };
   1508 
   1509 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
   1510 
   1511 #if defined(ISC_SOCKET_DEBUG)
   1512 	dump_msg(&msghdr);
   1513 #endif /* if defined(ISC_SOCKET_DEBUG) */
   1514 
   1515 	cc = recvmsg(sock->fd, &msghdr, 0);
   1516 	recv_errno = errno;
   1517 
   1518 #if defined(ISC_SOCKET_DEBUG)
   1519 	dump_msg(&msghdr);
   1520 #endif /* if defined(ISC_SOCKET_DEBUG) */
   1521 
   1522 	if (cc < 0) {
   1523 		if (SOFT_ERROR(recv_errno)) {
   1524 			return (DOIO_SOFT);
   1525 		}
   1526 
   1527 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
   1528 			strerror_r(recv_errno, strbuf, sizeof(strbuf));
   1529 			socket_log(sock, NULL, IOEVENT,
   1530 				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
   1531 				   sock->fd, cc, recv_errno, strbuf);
   1532 		}
   1533 
   1534 #define SOFT_OR_HARD(_system, _isc)                                   \
   1535 	if (recv_errno == _system) {                                  \
   1536 		if (sock->connected) {                                \
   1537 			dev->result = _isc;                           \
   1538 			inc_stats(sock->manager->stats,               \
   1539 				  sock->statsindex[STATID_RECVFAIL]); \
   1540 			return (DOIO_HARD);                           \
   1541 		}                                                     \
   1542 		return (DOIO_SOFT);                                   \
   1543 	}
   1544 #define ALWAYS_HARD(_system, _isc)                            \
   1545 	if (recv_errno == _system) {                          \
   1546 		dev->result = _isc;                           \
   1547 		inc_stats(sock->manager->stats,               \
   1548 			  sock->statsindex[STATID_RECVFAIL]); \
   1549 		return (DOIO_HARD);                           \
   1550 	}
   1551 
   1552 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
   1553 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
   1554 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
   1555 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
   1556 		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
   1557 		/*
   1558 		 * Older operating systems may still return EPROTO in some
   1559 		 * situations, for example when receiving ICMP/ICMPv6 errors.
   1560 		 * A real life scenario is when ICMPv6 returns code 5 or 6.
   1561 		 * These codes are introduced in RFC 4443 from March 2006,
   1562 		 * and the document obsoletes RFC 1885. But unfortunately not
   1563 		 * all operating systems have caught up with the new standard
   1564 		 * (in 2020) and thus a generic protocol error is returned.
   1565 		 */
   1566 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
   1567 		/* Should never get this one but it was seen. */
   1568 #ifdef ENOPROTOOPT
   1569 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
   1570 #endif /* ifdef ENOPROTOOPT */
   1571 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
   1572 
   1573 #undef SOFT_OR_HARD
   1574 #undef ALWAYS_HARD
   1575 
   1576 		dev->result = isc__errno2result(recv_errno);
   1577 		inc_stats(sock->manager->stats,
   1578 			  sock->statsindex[STATID_RECVFAIL]);
   1579 		return (DOIO_HARD);
   1580 	}
   1581 
   1582 	/*
   1583 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
   1584 	 * while on UDP sockets, zero length reads are perfectly valid,
   1585 	 * although strange.
   1586 	 */
   1587 	switch (sock->type) {
   1588 	case isc_sockettype_tcp:
   1589 	case isc_sockettype_unix:
   1590 		if (cc == 0) {
   1591 			return (DOIO_EOF);
   1592 		}
   1593 		break;
   1594 	case isc_sockettype_udp:
   1595 	case isc_sockettype_raw:
   1596 		break;
   1597 	case isc_sockettype_fdwatch:
   1598 	default:
   1599 		UNREACHABLE();
   1600 	}
   1601 
   1602 	if (sock->type == isc_sockettype_udp) {
   1603 		dev->address.length = msghdr.msg_namelen;
   1604 		if (isc_sockaddr_getport(&dev->address) == 0) {
   1605 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
   1606 				socket_log(sock, &dev->address, IOEVENT,
   1607 					   "dropping source port zero packet");
   1608 			}
   1609 			return (DOIO_SOFT);
   1610 		}
   1611 		/*
   1612 		 * Simulate a firewall blocking UDP responses bigger than
   1613 		 * 'maxudp' bytes.
   1614 		 */
   1615 		if (sock->manager->maxudp != 0 &&
   1616 		    cc > (int)sock->manager->maxudp)
   1617 		{
   1618 			return (DOIO_SOFT);
   1619 		}
   1620 	}
   1621 
   1622 	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");
   1623 
   1624 	/*
   1625 	 * Overflow bit detection.  If we received MORE bytes than we should,
   1626 	 * this indicates an overflow situation.  Set the flag in the
   1627 	 * dev entry and adjust how much we read by one.
   1628 	 */
   1629 #ifdef ISC_PLATFORM_RECVOVERFLOW
   1630 	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
   1631 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
   1632 		cc--;
   1633 	}
   1634 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
   1635 
   1636 	/*
   1637 	 * If there are control messages attached, run through them and pull
   1638 	 * out the interesting bits.
   1639 	 */
   1640 	process_cmsg(sock, &msghdr, dev);
   1641 
   1642 	/*
   1643 	 * update the buffers (if any) and the i/o count
   1644 	 */
   1645 	dev->n += cc;
   1646 
   1647 	/*
   1648 	 * If we read less than we expected, update counters,
   1649 	 * and let the upper layer poke the descriptor.
   1650 	 */
   1651 	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
   1652 		return (DOIO_SOFT);
   1653 	}
   1654 
   1655 	/*
   1656 	 * Full reads are posted, or partials if partials are ok.
   1657 	 */
   1658 	dev->result = ISC_R_SUCCESS;
   1659 	return (DOIO_SUCCESS);
   1660 }
   1661 
   1662 /*
   1663  * Returns:
   1664  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
   1665  *			ISC_R_SUCCESS.
   1666  *
   1667  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
   1668  *			dev->result contains the appropriate error.
   1669  *
   1670  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
   1671  *			event was sent.  The operation should be retried.
   1672  *
   1673  *	No other return values are possible.
   1674  */
   1675 static int
   1676 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
   1677 	int cc;
   1678 	struct iovec iov[MAXSCATTERGATHER_SEND];
   1679 	size_t write_count;
   1680 	struct msghdr msghdr;
   1681 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
   1682 	int attempts = 0;
   1683 	int send_errno;
   1684 	char strbuf[ISC_STRERRORSIZE];
   1685 	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };
   1686 
   1687 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
   1688 
   1689 resend:
   1690 	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
   1691 	    write_count > sock->manager->maxudp)
   1692 	{
   1693 		cc = write_count;
   1694 	} else {
   1695 		cc = sendmsg(sock->fd, &msghdr, 0);
   1696 	}
   1697 	send_errno = errno;
   1698 
   1699 	/*
   1700 	 * Check for error or block condition.
   1701 	 */
   1702 	if (cc < 0) {
   1703 		if (send_errno == EINTR && ++attempts < NRETRIES) {
   1704 			goto resend;
   1705 		}
   1706 
   1707 		if (SOFT_ERROR(send_errno)) {
   1708 			if (errno == EWOULDBLOCK || errno == EAGAIN) {
   1709 				dev->result = ISC_R_WOULDBLOCK;
   1710 			}
   1711 			return (DOIO_SOFT);
   1712 		}
   1713 
   1714 #define SOFT_OR_HARD(_system, _isc)                                   \
   1715 	if (send_errno == _system) {                                  \
   1716 		if (sock->connected) {                                \
   1717 			dev->result = _isc;                           \
   1718 			inc_stats(sock->manager->stats,               \
   1719 				  sock->statsindex[STATID_SENDFAIL]); \
   1720 			return (DOIO_HARD);                           \
   1721 		}                                                     \
   1722 		return (DOIO_SOFT);                                   \
   1723 	}
   1724 #define ALWAYS_HARD(_system, _isc)                            \
   1725 	if (send_errno == _system) {                          \
   1726 		dev->result = _isc;                           \
   1727 		inc_stats(sock->manager->stats,               \
   1728 			  sock->statsindex[STATID_SENDFAIL]); \
   1729 		return (DOIO_HARD);                           \
   1730 	}
   1731 
   1732 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
   1733 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
   1734 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
   1735 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
   1736 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
   1737 #ifdef EHOSTDOWN
   1738 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
   1739 #endif /* ifdef EHOSTDOWN */
   1740 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
   1741 		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
   1742 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
   1743 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
   1744 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
   1745 
   1746 #undef SOFT_OR_HARD
   1747 #undef ALWAYS_HARD
   1748 
   1749 		/*
   1750 		 * The other error types depend on whether or not the
   1751 		 * socket is UDP or TCP.  If it is UDP, some errors
   1752 		 * that we expect to be fatal under TCP are merely
   1753 		 * annoying, and are really soft errors.
   1754 		 *
   1755 		 * However, these soft errors are still returned as
   1756 		 * a status.
   1757 		 */
   1758 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
   1759 		strerror_r(send_errno, strbuf, sizeof(strbuf));
   1760 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
   1761 				 addrbuf, strbuf);
   1762 		dev->result = isc__errno2result(send_errno);
   1763 		inc_stats(sock->manager->stats,
   1764 			  sock->statsindex[STATID_SENDFAIL]);
   1765 		return (DOIO_HARD);
   1766 	}
   1767 
   1768 	if (cc == 0) {
   1769 		inc_stats(sock->manager->stats,
   1770 			  sock->statsindex[STATID_SENDFAIL]);
   1771 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   1772 				 "doio_send: send() returned 0");
   1773 	}
   1774 
   1775 	/*
   1776 	 * If we write less than we expected, update counters, poke.
   1777 	 */
   1778 	dev->n += cc;
   1779 	if ((size_t)cc != write_count) {
   1780 		return (DOIO_SOFT);
   1781 	}
   1782 
   1783 	/*
   1784 	 * Exactly what we wanted to write.  We're done with this
   1785 	 * entry.  Post its completion event.
   1786 	 */
   1787 	dev->result = ISC_R_SUCCESS;
   1788 	return (DOIO_SUCCESS);
   1789 }
   1790 
   1791 /*
   1792  * Kill.
   1793  *
   1794  * Caller must ensure that the socket is not locked and no external
   1795  * references exist.
   1796  */
   1797 static void
   1798 socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
   1799 	int lockid = FDLOCK_ID(fd);
   1800 	/*
   1801 	 * No one has this socket open, so the watcher doesn't have to be
   1802 	 * poked, and the socket doesn't have to be locked.
   1803 	 */
   1804 	LOCK(&thread->fdlock[lockid]);
   1805 	thread->fds[fd] = NULL;
   1806 	if (sock->type == isc_sockettype_fdwatch)
   1807 		thread->fdstate[fd] = CLOSED;
   1808 	else
   1809 		thread->fdstate[fd] = CLOSE_PENDING;
   1810 	UNLOCK(&thread->fdlock[lockid]);
   1811 	if (sock->type == isc_sockettype_fdwatch) {
   1812 		/*
   1813 		 * The caller may close the socket once this function returns,
   1814 		 * and `fd' may be reassigned for a new socket.  So we do
   1815 		 * unwatch_fd() here, rather than defer it via select_poke().
   1816 		 * Note: this may complicate data protection among threads and
   1817 		 * may reduce performance due to additional locks.  One way to
   1818 		 * solve this would be to dup() the watched descriptor, but we
   1819 		 * take a simpler approach at this moment.
   1820 		 */
   1821 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
   1822 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
   1823 	} else
   1824 		select_poke(thread->manager, thread->threadid, fd,
   1825 		    SELECT_POKE_CLOSE);
   1826 
   1827 	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
   1828 
   1829 	LOCK(&sock->lock);
   1830 	if (sock->active == 1) {
   1831 		dec_stats(thread->manager->stats,
   1832 			  sock->statsindex[STATID_ACTIVE]);
   1833 		sock->active = 0;
   1834 	}
   1835 	UNLOCK(&sock->lock);
   1836 
   1837 	/*
   1838 	 * update manager->maxfd here (XXX: this should be implemented more
   1839 	 * efficiently)
   1840 	 */
   1841 #ifdef USE_SELECT
   1842 	LOCK(&thread->manager->lock);
   1843 	if (thread->maxfd == fd) {
   1844 		int i;
   1845 
   1846 		thread->maxfd = 0;
   1847 		for (i = fd - 1; i >= 0; i--) {
   1848 			lockid = FDLOCK_ID(i);
   1849 
   1850 			LOCK(&thread->fdlock[lockid]);
   1851 			if (thread->fdstate[i] == MANAGED) {
   1852 				thread->maxfd = i;
   1853 				UNLOCK(&thread->fdlock[lockid]);
   1854 				break;
   1855 			}
   1856 			UNLOCK(&thread->fdlock[lockid]);
   1857 		}
   1858 		if (thread->maxfd < thread->pipe_fds[0]) {
   1859 			thread->maxfd = thread->pipe_fds[0];
   1860 		}
   1861 	}
   1862 
   1863 	UNLOCK(&thread->manager->lock);
   1864 #endif /* USE_SELECT */
   1865 }
   1866 
   1867 static void
   1868 destroy(isc_socket_t **sockp) {
   1869 	int fd = 0;
   1870 	isc_socket_t *sock = *sockp;
   1871 	isc_socketmgr_t *manager = sock->manager;
   1872 	isc__socketthread_t *thread = NULL;
   1873 
   1874 	socket_log(sock, NULL, CREATION, "destroying");
   1875 
   1876 	isc_refcount_destroy(&sock->references);
   1877 
   1878 	LOCK(&sock->lock);
   1879 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
   1880 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
   1881 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
   1882 	INSIST(ISC_LIST_EMPTY(sock->send_list));
   1883 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
   1884 
   1885 	if (sock->fd >= 0) {
   1886 		fd = sock->fd;
   1887 		thread = &manager->threads[sock->threadid];
   1888 		sock->fd = -1;
   1889 		sock->threadid = -1;
   1890 	}
   1891 	UNLOCK(&sock->lock);
   1892 
   1893 	if (fd > 0) {
   1894 		socketclose(thread, sock, fd);
   1895 	}
   1896 
   1897 	LOCK(&manager->lock);
   1898 
   1899 	ISC_LIST_UNLINK(manager->socklist, sock, link);
   1900 
   1901 	if (ISC_LIST_EMPTY(manager->socklist)) {
   1902 		SIGNAL(&manager->shutdown_ok);
   1903 	}
   1904 
   1905 	/* can't unlock manager as its memory context is still used */
   1906 	free_socket(sockp);
   1907 
   1908 	UNLOCK(&manager->lock);
   1909 }
   1910 
   1911 static isc_result_t
   1912 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
   1913 		isc_socket_t **socketp) {
   1914 	isc_socket_t *sock;
   1915 
   1916 	sock = isc_mem_get(manager->mctx, sizeof(*sock));
   1917 
   1918 	sock->magic = 0;
   1919 	isc_refcount_init(&sock->references, 0);
   1920 
   1921 	sock->manager = manager;
   1922 	sock->type = type;
   1923 	sock->fd = -1;
   1924 	sock->threadid = -1;
   1925 	sock->dscp = 0; /* TOS/TCLASS is zero until set. */
   1926 	sock->dupped = 0;
   1927 	sock->statsindex = NULL;
   1928 	sock->active = 0;
   1929 
   1930 	ISC_LINK_INIT(sock, link);
   1931 
   1932 	memset(sock->name, 0, sizeof(sock->name));
   1933 	sock->tag = NULL;
   1934 
   1935 	/*
   1936 	 * Set up list of readers and writers to be initially empty.
   1937 	 */
   1938 	ISC_LIST_INIT(sock->recv_list);
   1939 	ISC_LIST_INIT(sock->send_list);
   1940 	ISC_LIST_INIT(sock->accept_list);
   1941 	ISC_LIST_INIT(sock->connect_list);
   1942 
   1943 	sock->listener = 0;
   1944 	sock->connected = 0;
   1945 	sock->connecting = 0;
   1946 	sock->bound = 0;
   1947 	sock->pktdscp = 0;
   1948 
   1949 	/*
   1950 	 * Initialize the lock.
   1951 	 */
   1952 	isc_mutex_init(&sock->lock);
   1953 
   1954 	sock->magic = SOCKET_MAGIC;
   1955 	*socketp = sock;
   1956 
   1957 	return (ISC_R_SUCCESS);
   1958 }
   1959 
   1960 /*
   1961  * This event requires that the various lists be empty, that the reference
   1962  * count be 1, and that the magic number is valid.  The other socket bits,
   1963  * like the lock, must be initialized as well.  The fd associated must be
   1964  * marked as closed, by setting it to -1 on close, or this routine will
   1965  * also close the socket.
   1966  */
   1967 static void
   1968 free_socket(isc_socket_t **socketp) {
   1969 	isc_socket_t *sock = *socketp;
   1970 	*socketp = NULL;
   1971 
   1972 	INSIST(VALID_SOCKET(sock));
   1973 	isc_refcount_destroy(&sock->references);
   1974 	LOCK(&sock->lock);
   1975 	INSIST(!sock->connecting);
   1976 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
   1977 	INSIST(ISC_LIST_EMPTY(sock->send_list));
   1978 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
   1979 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
   1980 	INSIST(!ISC_LINK_LINKED(sock, link));
   1981 	UNLOCK(&sock->lock);
   1982 
   1983 	sock->magic = 0;
   1984 
   1985 	isc_mutex_destroy(&sock->lock);
   1986 
   1987 	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
   1988 }
   1989 
#if defined(SET_RCVBUF)
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
static int rcvbuf = ISC_RECV_BUFFER_SIZE;

/*
 * Probe, on a scratch UDP socket, for a receive buffer size that
 * setsockopt(SO_RCVBUF) accepts, and leave the result in 'rcvbuf'.
 *
 * The search starts at the current value of 'rcvbuf' and, whenever the
 * request is refused with ENOBUFS, narrows the range between the
 * socket's current size and the last refused size.  Any other failure
 * makes 'rcvbuf' fall back to the largest size known to work.  Nothing
 * is changed when no scratch socket can be created, or when the
 * socket's current size is already at least 'rcvbuf'.
 */
static void
set_rcvbuf(void) {
	int lo;		  /* largest size known to work */
	int hi = rcvbuf; /* largest size still worth trying */
	socklen_t optlen = sizeof(lo);
	int probe = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

	/*
	 * No IPv4?  Try IPv6 instead.  Linux 2.2 (and maybe others) return
	 * EINVAL instead of EAFNOSUPPORT.
	 */
	if (probe == -1 &&
	    (errno == EPROTONOSUPPORT || errno == EPFNOSUPPORT ||
	     errno == EAFNOSUPPORT || errno == EINVAL))
	{
		probe = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
	}
	if (probe == -1) {
		return;
	}

	if (getsockopt(probe, SOL_SOCKET, SO_RCVBUF, (void *)&lo, &optlen) !=
		    0 ||
	    lo >= rcvbuf)
	{
		close(probe);
		return;
	}

	for (;;) {
		if (setsockopt(probe, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno != ENOBUFS || rcvbuf <= lo) {
				/* Give up and keep what is known to work. */
				rcvbuf = lo;
				break;
			}
			/* Too big: retry halfway down to 'lo'. */
			hi = rcvbuf - 1;
			rcvbuf = (rcvbuf + lo) / 2;
			continue;
		}

		/* Accepted: this size is the new lower bound. */
		lo = rcvbuf;
		if (lo == hi) {
			break;
		}
		rcvbuf = hi;
	}

	close(probe);
}
#endif /* if defined(SET_RCVBUF) */
   2047 
#if defined(SET_SNDBUF)
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
static int sndbuf = ISC_SEND_BUFFER_SIZE;

/*
 * Probe, on a scratch UDP socket, for a send buffer size that
 * setsockopt(SO_SNDBUF) accepts, and leave the result in 'sndbuf'.
 *
 * The search starts at the current value of 'sndbuf' and, whenever the
 * request is refused with ENOBUFS, narrows the range between the
 * socket's current size and the last refused size.  Any other failure
 * makes 'sndbuf' fall back to the largest size known to work.  Nothing
 * is changed when no scratch socket can be created, or when the
 * socket's current size is already at least 'sndbuf'.
 */
static void
set_sndbuf(void) {
	int lo;		  /* largest size known to work */
	int hi = sndbuf; /* largest size still worth trying */
	socklen_t optlen = sizeof(lo);
	int probe = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

	/*
	 * No IPv4?  Try IPv6 instead.  Linux 2.2 (and maybe others) return
	 * EINVAL instead of EAFNOSUPPORT.
	 */
	if (probe == -1 &&
	    (errno == EPROTONOSUPPORT || errno == EPFNOSUPPORT ||
	     errno == EAFNOSUPPORT || errno == EINVAL))
	{
		probe = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
	}
	if (probe == -1) {
		return;
	}

	if (getsockopt(probe, SOL_SOCKET, SO_SNDBUF, (void *)&lo, &optlen) !=
		    0 ||
	    lo >= sndbuf)
	{
		close(probe);
		return;
	}

	for (;;) {
		if (setsockopt(probe, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno != ENOBUFS || sndbuf <= lo) {
				/* Give up and keep what is known to work. */
				sndbuf = lo;
				break;
			}
			/* Too big: retry halfway down to 'lo'. */
			hi = sndbuf - 1;
			sndbuf = (sndbuf + lo) / 2;
			continue;
		}

		/* Accepted: this size is the new lower bound. */
		lo = sndbuf;
		if (lo == hi) {
			break;
		}
		sndbuf = hi;
	}

	close(probe);
}
#endif /* if defined(SET_SNDBUF) */
   2105 
   2106 static void
   2107 use_min_mtu(isc_socket_t *sock) {
   2108 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
   2109 	UNUSED(sock);
   2110 #endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
   2111 #ifdef IPV6_USE_MIN_MTU
   2112 	/* use minimum MTU */
   2113 	if (sock->pf == AF_INET6) {
   2114 		int on = 1;
   2115 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
   2116 				 (void *)&on, sizeof(on));
   2117 	}
   2118 #endif /* ifdef IPV6_USE_MIN_MTU */
   2119 #if defined(IPV6_MTU)
   2120 	/*
   2121 	 * Use minimum MTU on IPv6 sockets.
   2122 	 */
   2123 	if (sock->pf == AF_INET6) {
   2124 		int mtu = 1280;
   2125 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
   2126 				 sizeof(mtu));
   2127 	}
   2128 #endif /* if defined(IPV6_MTU) */
   2129 }
   2130 
   2131 static void
   2132 set_tcp_maxseg(isc_socket_t *sock, int size) {
   2133 #ifdef TCP_MAXSEG
   2134 	if (sock->type == isc_sockettype_tcp) {
   2135 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
   2136 				 (void *)&size, sizeof(size));
   2137 	}
   2138 #endif /* ifdef TCP_MAXSEG */
   2139 }
   2140 
   2141 static void
   2142 set_ip_disable_pmtud(isc_socket_t *sock) {
   2143 	/*
   2144 	 * Disable Path MTU Discover on IP packets
   2145 	 */
   2146 	if (sock->pf == AF_INET6) {
   2147 #if defined(IPV6_DONTFRAG)
   2148 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
   2149 				 &(int){ 0 }, sizeof(int));
   2150 #endif
   2151 #if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
   2152 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
   2153 				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
   2154 #endif
   2155 	} else if (sock->pf == AF_INET) {
   2156 #if defined(IP_DONTFRAG)
   2157 		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
   2158 				 sizeof(int));
   2159 #endif
   2160 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
   2161 		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
   2162 				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
   2163 #endif
   2164 	}
   2165 }
   2166 
   2167 static isc_result_t
   2168 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
   2169 	   isc_socket_t *dup_socket) {
   2170 	isc_result_t result;
   2171 	char strbuf[ISC_STRERRORSIZE];
   2172 	const char *err = "socket";
   2173 	int tries = 0;
   2174 #if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
   2175 	int on = 1;
   2176 #endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
   2177 #if defined(SET_RCVBUF) || defined(SET_SNDBUF)
   2178 	socklen_t optlen;
   2179 	int size = 0;
   2180 #endif
   2181 
   2182 again:
   2183 	if (dup_socket == NULL) {
   2184 		switch (sock->type) {
   2185 		case isc_sockettype_udp:
   2186 			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
   2187 			break;
   2188 		case isc_sockettype_tcp:
   2189 			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
   2190 			break;
   2191 		case isc_sockettype_unix:
   2192 			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
   2193 			break;
   2194 		case isc_sockettype_raw:
   2195 			errno = EPFNOSUPPORT;
   2196 			/*
   2197 			 * PF_ROUTE is a alias for PF_NETLINK on linux.
   2198 			 */
   2199 #if defined(PF_ROUTE)
   2200 			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
   2201 #ifdef NETLINK_ROUTE
   2202 				sock->fd = socket(sock->pf, SOCK_RAW,
   2203 						  NETLINK_ROUTE);
   2204 #else  /* ifdef NETLINK_ROUTE */
   2205 				sock->fd = socket(sock->pf, SOCK_RAW, 0);
   2206 #endif /* ifdef NETLINK_ROUTE */
   2207 				if (sock->fd != -1) {
   2208 #ifdef NETLINK_ROUTE
   2209 					struct sockaddr_nl sa;
   2210 					int n;
   2211 
   2212 					/*
   2213 					 * Do an implicit bind.
   2214 					 */
   2215 					memset(&sa, 0, sizeof(sa));
   2216 					sa.nl_family = AF_NETLINK;
   2217 					sa.nl_groups = RTMGRP_IPV4_IFADDR |
   2218 						       RTMGRP_IPV6_IFADDR;
   2219 					n = bind(sock->fd,
   2220 						 (struct sockaddr *)&sa,
   2221 						 sizeof(sa));
   2222 					if (n < 0) {
   2223 						close(sock->fd);
   2224 						sock->fd = -1;
   2225 					}
   2226 #endif /* ifdef NETLINK_ROUTE */
   2227 					sock->bound = 1;
   2228 				}
   2229 			}
   2230 #endif /* if defined(PF_ROUTE) */
   2231 			break;
   2232 		case isc_sockettype_fdwatch:
   2233 			/*
   2234 			 * We should not be called for isc_sockettype_fdwatch
   2235 			 * sockets.
   2236 			 */
   2237 			INSIST(0);
   2238 			break;
   2239 		}
   2240 	} else {
   2241 		sock->fd = dup(dup_socket->fd);
   2242 		sock->dupped = 1;
   2243 		sock->bound = dup_socket->bound;
   2244 	}
   2245 	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
   2246 		goto again;
   2247 	}
   2248 
   2249 #ifdef F_DUPFD
   2250 	/*
   2251 	 * Leave a space for stdio and TCP to work in.
   2252 	 */
   2253 	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
   2254 	    sock->fd >= 0 && sock->fd < manager->reserved)
   2255 	{
   2256 		int newfd, tmp;
   2257 		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
   2258 		tmp = errno;
   2259 		(void)close(sock->fd);
   2260 		errno = tmp;
   2261 		sock->fd = newfd;
   2262 		err = "isc_socket_create: fcntl/reserved";
   2263 	} else if (sock->fd >= 0 && sock->fd < 20) {
   2264 		int newfd, tmp;
   2265 		newfd = fcntl(sock->fd, F_DUPFD, 20);
   2266 		tmp = errno;
   2267 		(void)close(sock->fd);
   2268 		errno = tmp;
   2269 		sock->fd = newfd;
   2270 		err = "isc_socket_create: fcntl";
   2271 	}
   2272 #endif /* ifdef F_DUPFD */
   2273 
   2274 	if (sock->fd >= (int)manager->maxsocks) {
   2275 		(void)close(sock->fd);
   2276 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   2277 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
   2278 			      "socket: file descriptor exceeds limit (%d/%u)",
   2279 			      sock->fd, manager->maxsocks);
   2280 		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
   2281 		return (ISC_R_NORESOURCES);
   2282 	}
   2283 
   2284 	if (sock->fd < 0) {
   2285 		switch (errno) {
   2286 		case EMFILE:
   2287 		case ENFILE:
   2288 			strerror_r(errno, strbuf, sizeof(strbuf));
   2289 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   2290 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
   2291 				      "%s: %s", err, strbuf);
   2292 			FALLTHROUGH;
   2293 		case ENOBUFS:
   2294 			inc_stats(manager->stats,
   2295 				  sock->statsindex[STATID_OPENFAIL]);
   2296 			return (ISC_R_NORESOURCES);
   2297 
   2298 		case EPROTONOSUPPORT:
   2299 		case EPFNOSUPPORT:
   2300 		case EAFNOSUPPORT:
   2301 		/*
   2302 		 * Linux 2.2 (and maybe others) return EINVAL instead of
   2303 		 * EAFNOSUPPORT.
   2304 		 */
   2305 		case EINVAL:
   2306 			inc_stats(manager->stats,
   2307 				  sock->statsindex[STATID_OPENFAIL]);
   2308 			return (ISC_R_FAMILYNOSUPPORT);
   2309 
   2310 		default:
   2311 			strerror_r(errno, strbuf, sizeof(strbuf));
   2312 			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
   2313 					 err, strbuf);
   2314 			inc_stats(manager->stats,
   2315 				  sock->statsindex[STATID_OPENFAIL]);
   2316 			return (ISC_R_UNEXPECTED);
   2317 		}
   2318 	}
   2319 
   2320 	if (dup_socket != NULL) {
   2321 		goto setup_done;
   2322 	}
   2323 
   2324 	result = make_nonblock(sock->fd);
   2325 	if (result != ISC_R_SUCCESS) {
   2326 		(void)close(sock->fd);
   2327 		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
   2328 		return (result);
   2329 	}
   2330 
   2331 #ifdef SO_NOSIGPIPE
   2332 	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
   2333 		       sizeof(on)) < 0)
   2334 	{
   2335 		strerror_r(errno, strbuf, sizeof(strbuf));
   2336 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   2337 				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
   2338 				 sock->fd, strbuf);
   2339 		/* Press on... */
   2340 	}
   2341 #endif /* ifdef SO_NOSIGPIPE */
   2342 
   2343 	/*
   2344 	 * Use minimum mtu if possible.
   2345 	 */
   2346 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
   2347 		use_min_mtu(sock);
   2348 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
   2349 	}
   2350 
   2351 #if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
   2352 	if (sock->type == isc_sockettype_udp) {
   2353 #if defined(USE_CMSG)
   2354 #if defined(SO_TIMESTAMP)
   2355 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
   2356 			       sizeof(on)) < 0 &&
   2357 		    errno != ENOPROTOOPT)
   2358 		{
   2359 			strerror_r(errno, strbuf, sizeof(strbuf));
   2360 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   2361 					 "setsockopt(%d, SO_TIMESTAMP) failed: "
   2362 					 "%s",
   2363 					 sock->fd, strbuf);
   2364 			/* Press on... */
   2365 		}
   2366 #endif /* SO_TIMESTAMP */
   2367 
   2368 #ifdef IPV6_RECVPKTINFO
   2369 		/* RFC 3542 */
   2370 		if ((sock->pf == AF_INET6) &&
   2371 		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
   2372 				(void *)&on, sizeof(on)) < 0))
   2373 		{
   2374 			strerror_r(errno, strbuf, sizeof(strbuf));
   2375 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   2376 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
   2377 					 "failed: %s",
   2378 					 sock->fd, strbuf);
   2379 		}
   2380 #else  /* ifdef IPV6_RECVPKTINFO */
   2381 		/* RFC 2292 */
   2382 		if ((sock->pf == AF_INET6) &&
   2383 		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
   2384 				(void *)&on, sizeof(on)) < 0))
   2385 		{
   2386 			strerror_r(errno, strbuf, sizeof(strbuf));
   2387 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   2388 					 "setsockopt(%d, IPV6_PKTINFO) failed: "
   2389 					 "%s",
   2390 					 sock->fd, strbuf);
   2391 		}
   2392 #endif /* IPV6_RECVPKTINFO */
   2393 #endif /* defined(USE_CMSG) */
   2394 
   2395 #if defined(SET_RCVBUF)
   2396 		optlen = sizeof(size);
   2397 		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
   2398 			       &optlen) == 0 &&
   2399 		    size < rcvbuf)
   2400 		{
   2401 			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
   2402 				      ISC_R_SUCCESS);
   2403 			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
   2404 				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
   2405 			{
   2406 				strerror_r(errno, strbuf, sizeof(strbuf));
   2407 				UNEXPECTED_ERROR(__FILE__, __LINE__,
   2408 						 "setsockopt(%d, SO_RCVBUF, "
   2409 						 "%d) failed: %s",
   2410 						 sock->fd, rcvbuf, strbuf);
   2411 			}
   2412 		}
   2413 #endif /* if defined(SET_RCVBUF) */
   2414 
   2415 #if defined(SET_SNDBUF)
   2416 		optlen = sizeof(size);
   2417 		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
   2418 			       &optlen) == 0 &&
   2419 		    size < sndbuf)
   2420 		{
   2421 			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
   2422 				      ISC_R_SUCCESS);
   2423 			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
   2424 				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
   2425 			{
   2426 				strerror_r(errno, strbuf, sizeof(strbuf));
   2427 				UNEXPECTED_ERROR(__FILE__, __LINE__,
   2428 						 "setsockopt(%d, SO_SNDBUF, "
   2429 						 "%d) failed: %s",
   2430 						 sock->fd, sndbuf, strbuf);
   2431 			}
   2432 		}
   2433 #endif /* if defined(SO_SNDBUF) */
   2434 	}
   2435 #ifdef IPV6_RECVTCLASS
   2436 	if ((sock->pf == AF_INET6) &&
   2437 	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
   2438 			sizeof(on)) < 0))
   2439 	{
   2440 		strerror_r(errno, strbuf, sizeof(strbuf));
   2441 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   2442 				 "setsockopt(%d, IPV6_RECVTCLASS) "
   2443 				 "failed: %s",
   2444 				 sock->fd, strbuf);
   2445 	}
   2446 #endif /* ifdef IPV6_RECVTCLASS */
   2447 #ifdef IP_RECVTOS
   2448 	if ((sock->pf == AF_INET) &&
   2449 	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
   2450 			sizeof(on)) < 0))
   2451 	{
   2452 		strerror_r(errno, strbuf, sizeof(strbuf));
   2453 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   2454 				 "setsockopt(%d, IP_RECVTOS) "
   2455 				 "failed: %s",
   2456 				 sock->fd, strbuf);
   2457 	}
   2458 #endif /* ifdef IP_RECVTOS */
   2459 #endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */
   2460 
   2461 	set_ip_disable_pmtud(sock);
   2462 
   2463 setup_done:
   2464 	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
   2465 	if (sock->active == 0) {
   2466 		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
   2467 		sock->active = 1;
   2468 	}
   2469 
   2470 	return (ISC_R_SUCCESS);
   2471 }
   2472 
   2473 /*
   2474  * Create a 'type' socket or duplicate an existing socket, managed
   2475  * by 'manager'.  Events will be posted to 'task' and when dispatched
   2476  * 'action' will be called with 'arg' as the arg value.  The new
   2477  * socket is returned in 'socketp'.
   2478  */
   2479 static isc_result_t
   2480 socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
   2481 	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
   2482 	isc_socket_t *sock = NULL;
   2483 	isc__socketthread_t *thread;
   2484 	isc_result_t result;
   2485 	int lockid;
   2486 
   2487 	REQUIRE(VALID_MANAGER(manager));
   2488 	REQUIRE(socketp != NULL && *socketp == NULL);
   2489 	REQUIRE(type != isc_sockettype_fdwatch);
   2490 
   2491 	result = allocate_socket(manager, type, &sock);
   2492 	if (result != ISC_R_SUCCESS) {
   2493 		return (result);
   2494 	}
   2495 
   2496 	switch (sock->type) {
   2497 	case isc_sockettype_udp:
   2498 		sock->statsindex = (pf == AF_INET) ? udp4statsindex
   2499 						   : udp6statsindex;
   2500 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
   2501 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
   2502 		break;
   2503 	case isc_sockettype_tcp:
   2504 		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
   2505 						   : tcp6statsindex;
   2506 		break;
   2507 	case isc_sockettype_unix:
   2508 		sock->statsindex = unixstatsindex;
   2509 		break;
   2510 	case isc_sockettype_raw:
   2511 		sock->statsindex = rawstatsindex;
   2512 		break;
   2513 	default:
   2514 		UNREACHABLE();
   2515 	}
   2516 
   2517 	sock->pf = pf;
   2518 
   2519 	result = opensocket(manager, sock, dup_socket);
   2520 	if (result != ISC_R_SUCCESS) {
   2521 		free_socket(&sock);
   2522 		return (result);
   2523 	}
   2524 
   2525 	if (sock->fd == -1) {
   2526 		abort();
   2527 	}
   2528 	sock->threadid = gen_threadid(sock);
   2529 	isc_refcount_increment0(&sock->references);
   2530 	thread = &manager->threads[sock->threadid];
   2531 	*socketp = sock;
   2532 
   2533 	/*
   2534 	 * Note we don't have to lock the socket like we normally would because
   2535 	 * there are no external references to it yet.
   2536 	 */
   2537 
   2538 	lockid = FDLOCK_ID(sock->fd);
   2539 	LOCK(&thread->fdlock[lockid]);
   2540 	thread->fds[sock->fd] = sock;
   2541 	thread->fdstate[sock->fd] = MANAGED;
   2542 #if defined(USE_EPOLL)
   2543 	thread->epoll_events[sock->fd] = 0;
   2544 #endif /* if defined(USE_EPOLL) */
   2545 #ifdef USE_DEVPOLL
   2546 	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
   2547 	       thread->fdpollinfo[sock->fd].want_write == 0);
   2548 #endif /* ifdef USE_DEVPOLL */
   2549 	UNLOCK(&thread->fdlock[lockid]);
   2550 
   2551 	LOCK(&manager->lock);
   2552 	ISC_LIST_APPEND(manager->socklist, sock, link);
   2553 #ifdef USE_SELECT
   2554 	if (thread->maxfd < sock->fd) {
   2555 		thread->maxfd = sock->fd;
   2556 	}
   2557 #endif /* ifdef USE_SELECT */
   2558 	UNLOCK(&manager->lock);
   2559 
   2560 	socket_log(sock, NULL, CREATION,
   2561 		   dup_socket != NULL ? "dupped" : "created");
   2562 
   2563 	return (ISC_R_SUCCESS);
   2564 }
   2565 
   2566 /*%
   2567  * Create a new 'type' socket managed by 'manager'.  Events
   2568  * will be posted to 'task' and when dispatched 'action' will be
   2569  * called with 'arg' as the arg value.  The new socket is returned
   2570  * in 'socketp'.
   2571  */
   2572 isc_result_t
   2573 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
   2574 		  isc_socket_t **socketp) {
   2575 	return (socket_create(manager0, pf, type, socketp, NULL));
   2576 }
   2577 
   2578 /*%
   2579  * Duplicate an existing socket.  The new socket is returned
   2580  * in 'socketp'.
   2581  */
   2582 isc_result_t
   2583 isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
   2584 	REQUIRE(VALID_SOCKET(sock));
   2585 	REQUIRE(socketp != NULL && *socketp == NULL);
   2586 
   2587 	return (socket_create(sock->manager, sock->pf, sock->type, socketp,
   2588 			      sock));
   2589 }
   2590 
   2591 isc_result_t
   2592 isc_socket_open(isc_socket_t *sock) {
   2593 	isc_result_t result;
   2594 	isc__socketthread_t *thread;
   2595 
   2596 	REQUIRE(VALID_SOCKET(sock));
   2597 
   2598 	LOCK(&sock->lock);
   2599 
   2600 	REQUIRE(isc_refcount_current(&sock->references) >= 1);
   2601 	REQUIRE(sock->fd == -1);
   2602 	REQUIRE(sock->threadid == -1);
   2603 	REQUIRE(sock->type != isc_sockettype_fdwatch);
   2604 
   2605 	result = opensocket(sock->manager, sock, NULL);
   2606 
   2607 	UNLOCK(&sock->lock);
   2608 
   2609 	if (result != ISC_R_SUCCESS) {
   2610 		sock->fd = -1;
   2611 	} else {
   2612 		sock->threadid = gen_threadid(sock);
   2613 		thread = &sock->manager->threads[sock->threadid];
   2614 		int lockid = FDLOCK_ID(sock->fd);
   2615 
   2616 		LOCK(&thread->fdlock[lockid]);
   2617 		thread->fds[sock->fd] = sock;
   2618 		thread->fdstate[sock->fd] = MANAGED;
   2619 #if defined(USE_EPOLL)
   2620 		thread->epoll_events[sock->fd] = 0;
   2621 #endif /* if defined(USE_EPOLL) */
   2622 #ifdef USE_DEVPOLL
   2623 		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
   2624 		       thread->fdpollinfo[sock->fd].want_write == 0);
   2625 #endif /* ifdef USE_DEVPOLL */
   2626 		UNLOCK(&thread->fdlock[lockid]);
   2627 
   2628 #ifdef USE_SELECT
   2629 		LOCK(&sock->manager->lock);
   2630 		if (thread->maxfd < sock->fd) {
   2631 			thread->maxfd = sock->fd;
   2632 		}
   2633 		UNLOCK(&sock->manager->lock);
   2634 #endif /* ifdef USE_SELECT */
   2635 	}
   2636 
   2637 	return (result);
   2638 }
   2639 
   2640 /*
   2641  * Attach to a socket.  Caller must explicitly detach when it is done.
   2642  */
   2643 void
   2644 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
   2645 	REQUIRE(VALID_SOCKET(sock));
   2646 	REQUIRE(socketp != NULL && *socketp == NULL);
   2647 
   2648 	int old_refs = isc_refcount_increment(&sock->references);
   2649 	REQUIRE(old_refs > 0);
   2650 
   2651 	*socketp = sock;
   2652 }
   2653 
   2654 /*
   2655  * Dereference a socket.  If this is the last reference to it, clean things
   2656  * up by destroying the socket.
   2657  */
   2658 void
   2659 isc_socket_detach(isc_socket_t **socketp) {
   2660 	isc_socket_t *sock;
   2661 
   2662 	REQUIRE(socketp != NULL);
   2663 	sock = *socketp;
   2664 	REQUIRE(VALID_SOCKET(sock));
   2665 	if (isc_refcount_decrement(&sock->references) == 1) {
   2666 		destroy(&sock);
   2667 	}
   2668 
   2669 	*socketp = NULL;
   2670 }
   2671 
   2672 isc_result_t
   2673 isc_socket_close(isc_socket_t *sock) {
   2674 	int fd;
   2675 	isc_socketmgr_t *manager;
   2676 	isc__socketthread_t *thread;
   2677 	fflush(stdout);
   2678 	REQUIRE(VALID_SOCKET(sock));
   2679 
   2680 	LOCK(&sock->lock);
   2681 
   2682 	REQUIRE(sock->type != isc_sockettype_fdwatch);
   2683 	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
   2684 
   2685 	INSIST(!sock->connecting);
   2686 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
   2687 	INSIST(ISC_LIST_EMPTY(sock->send_list));
   2688 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
   2689 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
   2690 
   2691 	manager = sock->manager;
   2692 	thread = &manager->threads[sock->threadid];
   2693 	fd = sock->fd;
   2694 	sock->fd = -1;
   2695 	sock->threadid = -1;
   2696 
   2697 	sock->dupped = 0;
   2698 	memset(sock->name, 0, sizeof(sock->name));
   2699 	sock->tag = NULL;
   2700 	sock->listener = 0;
   2701 	sock->connected = 0;
   2702 	sock->connecting = 0;
   2703 	sock->bound = 0;
   2704 	isc_sockaddr_any(&sock->peer_address);
   2705 
   2706 	UNLOCK(&sock->lock);
   2707 
   2708 	socketclose(thread, sock, fd);
   2709 
   2710 	return (ISC_R_SUCCESS);
   2711 }
   2712 
   2713 static void
   2714 dispatch_recv(isc_socket_t *sock) {
   2715 	if (sock->type != isc_sockettype_fdwatch) {
   2716 		internal_recv(sock);
   2717 	} else {
   2718 		internal_fdwatch_read(sock);
   2719 	}
   2720 }
   2721 
   2722 static void
   2723 dispatch_send(isc_socket_t *sock) {
   2724 	if (sock->type != isc_sockettype_fdwatch) {
   2725 		internal_send(sock);
   2726 	} else {
   2727 		internal_fdwatch_write(sock);
   2728 	}
   2729 }
   2730 
   2731 /*
   2732  * Dequeue an item off the given socket's read queue, set the result code
   2733  * in the done event to the one provided, and send it to the task it was
   2734  * destined for.
   2735  *
   2736  * If the event to be sent is on a list, remove it before sending.  If
   2737  * asked to, send and detach from the socket as well.
   2738  *
   2739  * Caller must have the socket locked if the event is attached to the socket.
   2740  */
   2741 static void
   2742 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
   2743 	isc_task_t *task;
   2744 
   2745 	task = (*dev)->ev_sender;
   2746 
   2747 	(*dev)->ev_sender = sock;
   2748 
   2749 	if (ISC_LINK_LINKED(*dev, ev_link)) {
   2750 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
   2751 	}
   2752 
   2753 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
   2754 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
   2755 					 sock->threadid);
   2756 	} else {
   2757 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
   2758 	}
   2759 }
   2760 
   2761 /*
   2762  * See comments for send_recvdone_event() above.
   2763  *
   2764  * Caller must have the socket locked if the event is attached to the socket.
   2765  */
   2766 static void
   2767 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
   2768 	isc_task_t *task;
   2769 
   2770 	INSIST(dev != NULL && *dev != NULL);
   2771 
   2772 	task = (*dev)->ev_sender;
   2773 	(*dev)->ev_sender = sock;
   2774 
   2775 	if (ISC_LINK_LINKED(*dev, ev_link)) {
   2776 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
   2777 	}
   2778 
   2779 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
   2780 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
   2781 					 sock->threadid);
   2782 	} else {
   2783 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
   2784 	}
   2785 }
   2786 
   2787 /*
   2788  * See comments for send_recvdone_event() above.
   2789  *
   2790  * Caller must have the socket locked if the event is attached to the socket.
   2791  */
   2792 static void
   2793 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
   2794 	isc_task_t *task;
   2795 
   2796 	INSIST(dev != NULL && *dev != NULL);
   2797 
   2798 	task = (*dev)->ev_sender;
   2799 	(*dev)->ev_sender = sock;
   2800 
   2801 	if (ISC_LINK_LINKED(*dev, ev_link)) {
   2802 		ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
   2803 	}
   2804 
   2805 	isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
   2806 }
   2807 
/*
 * Call accept() on a socket, to get the new file descriptor.  The listen
 * socket is used as a prototype to create a new isc_socket_t.  The new
 * socket has one outstanding reference.  The task receiving the event
 * will be detached from just after the event is delivered.
 *
 * On entry to this function, the event delivered is the internal
 * readable event, and the first item on the accept_list should be
 * the done event we want to send.  If the list is empty, this is a no-op,
 * so just unlock and return.
 *
 * Locking: this function is entered with sock->lock held and releases it
 * on every exit path (empty list, normal completion, and soft_error).
 *
 * Outcomes:
 *	- soft errors: the listener is watched again, ACCEPTFAIL is counted
 *	  and the done event stays queued on accept_list;
 *	- hard failure (unexpected errno or make_nonblock() failure): the
 *	  pre-allocated new socket is freed and the done event is sent with
 *	  the failure result;
 *	- success: the new socket is registered with its thread and the
 *	  manager and the done event is sent with ISC_R_SUCCESS.
 */
static void
internal_accept(isc_socket_t *sock) {
	isc_socketmgr_t *manager;
	isc__socketthread_t *thread, *nthread;
	isc_socket_newconnev_t *dev;
	isc_task_t *task;
	socklen_t addrlen;
	int fd;
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	/* Name of the failing call, used in log messages below. */
	const char *err = "accept";

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");

	manager = sock->manager;
	INSIST(VALID_MANAGER(manager));
	thread = &manager->threads[sock->threadid];

	INSIST(sock->listener);

	/*
	 * Get the first item off the accept list.
	 * If it is empty, stop watching the listener for incoming
	 * connections, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
		UNLOCK(&sock->lock);
		return;
	}

	/*
	 * Try to accept the new connection.  If the accept fails with
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
	 * again.  Also ignore ECONNRESET, which has been reported to
	 * be spuriously returned on Linux 2.2.19 although it is not
	 * a documented error for accept().  ECONNABORTED has been
	 * reported for Solaris 8.  The rest are thrown in not because
	 * we have seen them but because they are ignored by other
	 * daemons such as BIND 8 and Apache.
	 */

	/* The peer address is accepted straight into the new socket. */
	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
		    (void *)&addrlen);

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio to work in: move descriptors below 20
	 * up to 20 or above.  errno from fcntl() is preserved across the
	 * close() so that the error handling below reports the right
	 * failure.
	 */
	if (fd >= 0 && fd < 20) {
		int newfd, tmp;
		newfd = fcntl(fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(fd);
		errno = tmp;
		fd = newfd;
		err = "accept/fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (fd < 0) {
		if (SOFT_ERROR(errno)) {
			goto soft_error;
		}
		switch (errno) {
		case ENFILE:
		case EMFILE:
			/* Out of descriptors: log it, then treat as soft. */
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: too many open file descriptors",
				      err);
			goto soft_error;

		case ENOBUFS:
		case ENOMEM:
		case ECONNRESET:
		case ECONNABORTED:
		case EHOSTUNREACH:
		case EHOSTDOWN:
		case ENETUNREACH:
		case ENETDOWN:
		case ECONNREFUSED:
#ifdef EPROTO
		case EPROTO:
#endif /* ifdef EPROTO */
#ifdef ENONET
		case ENONET:
#endif /* ifdef ENONET */
			goto soft_error;
		default:
			break;
		}
		/* Anything else is a hard failure reported to the task. */
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: %s() failed: %s", err,
				 strbuf);
		fd = -1;
		result = ISC_R_UNEXPECTED;
	} else {
		/*
		 * accept() succeeded; sanity-check what it returned.  Each
		 * failed check closes the new descriptor and is handled as
		 * a soft error.
		 */
		if (addrlen == 0U) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_accept(): "
					 "accept() failed to return "
					 "remote address");

			(void)close(fd);
			goto soft_error;
		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
			   sock->pf)
		{
			UNEXPECTED_ERROR(
				__FILE__, __LINE__,
				"internal_accept(): "
				"accept() returned peer address "
				"family %u (expected %u)",
				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
				sock->pf);
			(void)close(fd);
			goto soft_error;
		} else if (fd >= (int)manager->maxsocks) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "accept: file descriptor exceeds limit "
				      "(%d/%u)",
				      fd, manager->maxsocks);
			(void)close(fd);
			goto soft_error;
		}
	}

	if (fd != -1) {
		NEWCONNSOCK(dev)->peer_address.length = addrlen;
		NEWCONNSOCK(dev)->pf = sock->pf;
	}

	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

	/*
	 * Stop watching the listener for incoming connections when no
	 * further accept requests are queued.
	 */
	if (ISC_LIST_EMPTY(sock->accept_list)) {
		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	}

	/*
	 * If the new descriptor cannot be made non-blocking, close it and
	 * report make_nonblock()'s result to the task.
	 */
	if (fd != -1) {
		result = make_nonblock(fd);
		if (result != ISC_R_SUCCESS) {
			(void)close(fd);
			fd = -1;
		}
	}

	/*
	 * We need to unlock sock->lock now to be able to lock manager->lock
	 * without risking a deadlock with xmlstats.
	 */
	UNLOCK(&sock->lock);

	/*
	 * -1 means the new socket didn't happen.
	 */
	if (fd != -1) {
		int lockid = FDLOCK_ID(fd);

		NEWCONNSOCK(dev)->fd = fd;
		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
		NEWCONNSOCK(dev)->bound = 1;
		NEWCONNSOCK(dev)->connected = 1;
		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];

		/*
		 * We already hold a lock on one fdlock in accepting thread,
		 * we need to make sure that we don't double lock.
		 */
		bool same_bucket = (sock->threadid ==
				    NEWCONNSOCK(dev)->threadid) &&
				   (FDLOCK_ID(sock->fd) == lockid);

		/*
		 * Use minimum mtu if possible.
		 */
		use_min_mtu(NEWCONNSOCK(dev));
		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);

		/*
		 * Ensure DSCP settings are inherited across accept.
		 */
		setdscp(NEWCONNSOCK(dev), sock->dscp);

		/*
		 * Save away the remote address
		 */
		dev->address = NEWCONNSOCK(dev)->peer_address;

		/* Count the new socket as active exactly once. */
		if (NEWCONNSOCK(dev)->active == 0) {
			inc_stats(manager->stats,
				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
			NEWCONNSOCK(dev)->active = 1;
		}

		/*
		 * Register the new descriptor in its thread's tables, taking
		 * the fdlock only if it is not the one already held.
		 */
		if (!same_bucket) {
			LOCK(&nthread->fdlock[lockid]);
		}
		nthread->fds[fd] = NEWCONNSOCK(dev);
		nthread->fdstate[fd] = MANAGED;
#if defined(USE_EPOLL)
		nthread->epoll_events[fd] = 0;
#endif /* if defined(USE_EPOLL) */
		if (!same_bucket) {
			UNLOCK(&nthread->fdlock[lockid]);
		}

		LOCK(&manager->lock);

#ifdef USE_SELECT
		if (nthread->maxfd < fd) {
			nthread->maxfd = fd;
		}
#endif /* ifdef USE_SELECT */

		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
			   "accepted connection, new socket %p",
			   dev->newsocket);

		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

		UNLOCK(&manager->lock);

		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
	} else {
		/*
		 * Hard failure: count it and release the pre-allocated new
		 * socket (drop its reference, then free it).
		 */
		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
		free_socket((isc_socket_t **)&dev->newsocket);
	}

	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
	task = dev->ev_sender;
	dev->ev_sender = sock;

	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
	return;

soft_error:
	/*
	 * Transient failure: keep the done event queued, watch the listener
	 * again, release the socket lock and count the failed accept.
	 */
	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
	UNLOCK(&sock->lock);

	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
	return;
}
   3079 
   3080 static void
   3081 internal_recv(isc_socket_t *sock) {
   3082 	isc_socketevent_t *dev;
   3083 
   3084 	INSIST(VALID_SOCKET(sock));
   3085 	REQUIRE(sock->fd >= 0);
   3086 
   3087 	dev = ISC_LIST_HEAD(sock->recv_list);
   3088 	if (dev == NULL) {
   3089 		goto finish;
   3090 	}
   3091 
   3092 	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
   3093 		   dev, dev->ev_sender);
   3094 
   3095 	/*
   3096 	 * Try to do as much I/O as possible on this socket.  There are no
   3097 	 * limits here, currently.
   3098 	 */
   3099 	while (dev != NULL) {
   3100 		switch (doio_recv(sock, dev)) {
   3101 		case DOIO_SOFT:
   3102 			goto finish;
   3103 
   3104 		case DOIO_EOF:
   3105 			/*
   3106 			 * read of 0 means the remote end was closed.
   3107 			 * Run through the event queue and dispatch all
   3108 			 * the events with an EOF result code.
   3109 			 */
   3110 			do {
   3111 				dev->result = ISC_R_EOF;
   3112 				send_recvdone_event(sock, &dev);
   3113 				dev = ISC_LIST_HEAD(sock->recv_list);
   3114 			} while (dev != NULL);
   3115 			goto finish;
   3116 
   3117 		case DOIO_SUCCESS:
   3118 		case DOIO_HARD:
   3119 			send_recvdone_event(sock, &dev);
   3120 			break;
   3121 		}
   3122 
   3123 		dev = ISC_LIST_HEAD(sock->recv_list);
   3124 	}
   3125 
   3126 finish:
   3127 	if (ISC_LIST_EMPTY(sock->recv_list)) {
   3128 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
   3129 			   SELECT_POKE_READ);
   3130 	}
   3131 }
   3132 
   3133 static void
   3134 internal_send(isc_socket_t *sock) {
   3135 	isc_socketevent_t *dev;
   3136 
   3137 	INSIST(VALID_SOCKET(sock));
   3138 	REQUIRE(sock->fd >= 0);
   3139 
   3140 	dev = ISC_LIST_HEAD(sock->send_list);
   3141 	if (dev == NULL) {
   3142 		goto finish;
   3143 	}
   3144 	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
   3145 		   dev->ev_sender);
   3146 
   3147 	/*
   3148 	 * Try to do as much I/O as possible on this socket.  There are no
   3149 	 * limits here, currently.
   3150 	 */
   3151 	while (dev != NULL) {
   3152 		switch (doio_send(sock, dev)) {
   3153 		case DOIO_SOFT:
   3154 			goto finish;
   3155 
   3156 		case DOIO_HARD:
   3157 		case DOIO_SUCCESS:
   3158 			send_senddone_event(sock, &dev);
   3159 			break;
   3160 		}
   3161 
   3162 		dev = ISC_LIST_HEAD(sock->send_list);
   3163 	}
   3164 
   3165 finish:
   3166 	if (ISC_LIST_EMPTY(sock->send_list)) {
   3167 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
   3168 			   SELECT_POKE_WRITE);
   3169 	}
   3170 }
   3171 
   3172 static void
   3173 internal_fdwatch_write(isc_socket_t *sock)
   3174 {
   3175 	int more_data;
   3176 
   3177 	INSIST(VALID_SOCKET(sock));
   3178 
   3179 	isc_refcount_increment(&sock->references);
   3180 	UNLOCK(&sock->lock);
   3181 
   3182 	more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
   3183 				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
   3184 
   3185 	LOCK(&sock->lock);
   3186 
   3187 	if (isc_refcount_decrement(&sock->references) == 0) {
   3188 		UNLOCK(&sock->lock);
   3189 		destroy(&sock);
   3190 		return;
   3191 	}
   3192 
   3193 	if (more_data)
   3194 		select_poke(sock->manager, sock->threadid, sock->fd,
   3195 		    SELECT_POKE_WRITE);
   3196 }
   3197 
   3198 static void
   3199 internal_fdwatch_read(isc_socket_t *sock)
   3200 {
   3201 	int more_data;
   3202 
   3203 	INSIST(VALID_SOCKET(sock));
   3204 
   3205 	isc_refcount_increment(&sock->references);
   3206 	UNLOCK(&sock->lock);
   3207 
   3208 	more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
   3209 				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
   3210 
   3211 	LOCK(&sock->lock);
   3212 
   3213 	if (isc_refcount_decrement(&sock->references) == 0) {
   3214 		UNLOCK(&sock->lock);
   3215 		destroy(&sock);
   3216 		return;
   3217 	}
   3218 
   3219 	if (more_data)
   3220 		select_poke(sock->manager, sock->threadid, sock->fd,
   3221 		    SELECT_POKE_READ);
   3222 }
   3223 
/*
 * Process read/writes on each fd here.  Avoid locking
 * and unlocking twice if both reads and writes are possible.
 *
 * 'readable' and 'writeable' say which conditions the event loop reported
 * for 'fd'; at least one of them must be true once a live socket has been
 * found for the descriptor (enforced with REQUIRE below).
 *
 * Locking: thread->fdlock[FDLOCK_ID(fd)] is taken first, then sock->lock.
 * Both are released before returning; on the accept path sock->lock is
 * released by internal_accept() itself.
 */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc_socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	/* No socket registered for this descriptor: nothing to do. */
	sock = thread->fds[fd];
	if (sock == NULL) {
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	/* Writability either completes a pending connect or drives sends. */
	if (writeable) {
		if (sock->connecting) {
			internal_connect(sock);
		} else {
			dispatch_send(sock);
		}
	}

	/*
	 * Readability on a listener means a connection can be accepted;
	 * otherwise it drives receives.  Exactly one of the branches below
	 * releases sock->lock.
	 */
	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			dispatch_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
   3293 
/*
 * process_fds() has a different implementation for each event loop
 * backend.  Each one takes the events reported by the event loop and
 * calls process_fd() for every file descriptor involved.
 */
   3299 #ifdef USE_KQUEUE
   3300 static bool
   3301 process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
   3302 	int i;
   3303 	bool readable, writable;
   3304 	bool done = false;
   3305 	bool have_ctlevent = false;
   3306 	if (nevents == thread->nevents) {
   3307 		/*
   3308 		 * This is not an error, but something unexpected.  If this
   3309 		 * happens, it may indicate the need for increasing
   3310 		 * ISC_SOCKET_MAXEVENTS.
   3311 		 */
   3312 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
   3313 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
   3314 			   "maximum number of FD events (%d) received",
   3315 			   nevents);
   3316 	}
   3317 
   3318 	for (i = 0; i < nevents; i++) {
   3319 		REQUIRE(events[i].ident < thread->manager->maxsocks);
   3320 		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
   3321 			have_ctlevent = true;
   3322 			continue;
   3323 		}
   3324 		readable = (events[i].filter == EVFILT_READ);
   3325 		writable = (events[i].filter == EVFILT_WRITE);
   3326 		process_fd(thread, events[i].ident, readable, writable);
   3327 	}
   3328 
   3329 	if (have_ctlevent) {
   3330 		done = process_ctlfd(thread);
   3331 	}
   3332 
   3333 	return (done);
   3334 }
   3335 #elif defined(USE_EPOLL)
   3336 static bool
   3337 process_fds(isc__socketthread_t *thread, struct epoll_event *events,
   3338 	    int nevents) {
   3339 	int i;
   3340 	bool done = false;
   3341 	bool have_ctlevent = false;
   3342 
   3343 	if (nevents == thread->nevents) {
   3344 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
   3345 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
   3346 			   "maximum number of FD events (%d) received",
   3347 			   nevents);
   3348 	}
   3349 
   3350 	for (i = 0; i < nevents; i++) {
   3351 		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
   3352 		if (events[i].data.fd == thread->pipe_fds[0]) {
   3353 			have_ctlevent = true;
   3354 			continue;
   3355 		}
   3356 		if ((events[i].events & EPOLLERR) != 0 ||
   3357 		    (events[i].events & EPOLLHUP) != 0)
   3358 		{
   3359 			/*
   3360 			 * epoll does not set IN/OUT bits on an erroneous
   3361 			 * condition, so we need to try both anyway.  This is a
   3362 			 * bit inefficient, but should be okay for such rare
   3363 			 * events.  Note also that the read or write attempt
   3364 			 * won't block because we use non-blocking sockets.
   3365 			 */
   3366 			int fd = events[i].data.fd;
   3367 			events[i].events |= thread->epoll_events[fd];
   3368 		}
   3369 		process_fd(thread, events[i].data.fd,
   3370 			   (events[i].events & EPOLLIN) != 0,
   3371 			   (events[i].events & EPOLLOUT) != 0);
   3372 	}
   3373 
   3374 	if (have_ctlevent) {
   3375 		done = process_ctlfd(thread);
   3376 	}
   3377 
   3378 	return (done);
   3379 }
   3380 #elif defined(USE_DEVPOLL)
   3381 static bool
   3382 process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
   3383 	int i;
   3384 	bool done = false;
   3385 	bool have_ctlevent = false;
   3386 
   3387 	if (nevents == thread->nevents) {
   3388 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
   3389 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
   3390 			   "maximum number of FD events (%d) received",
   3391 			   nevents);
   3392 	}
   3393 
   3394 	for (i = 0; i < nevents; i++) {
   3395 		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
   3396 		if (events[i].fd == thread->pipe_fds[0]) {
   3397 			have_ctlevent = true;
   3398 			continue;
   3399 		}
   3400 		process_fd(thread, events[i].fd,
   3401 			   (events[i].events & POLLIN) != 0,
   3402 			   (events[i].events & POLLOUT) != 0);
   3403 	}
   3404 
   3405 	if (have_ctlevent) {
   3406 		done = process_ctlfd(thread);
   3407 	}
   3408 
   3409 	return (done);
   3410 }
   3411 #elif defined(USE_SELECT)
   3412 static void
   3413 process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
   3414 	    fd_set *writefds) {
   3415 	int i;
   3416 
   3417 	REQUIRE(maxfd <= (int)thread->manager->maxsocks);
   3418 
   3419 	for (i = 0; i < maxfd; i++) {
   3420 		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
   3421 			continue;
   3422 		}
   3423 		process_fd(thread, i, FD_ISSET(i, readfds),
   3424 			   FD_ISSET(i, writefds));
   3425 	}
   3426 }
   3427 #endif /* ifdef USE_KQUEUE */
   3428 
   3429 static bool
   3430 process_ctlfd(isc__socketthread_t *thread) {
   3431 	int msg, fd;
   3432 
   3433 	for (;;) {
   3434 		select_readmsg(thread, &fd, &msg);
   3435 
   3436 		thread_log(thread, IOEVENT,
   3437 			   "watcher got message %d for socket %d", msg, fd);
   3438 
   3439 		/*
   3440 		 * Nothing to read?
   3441 		 */
   3442 		if (msg == SELECT_POKE_NOTHING) {
   3443 			break;
   3444 		}
   3445 
   3446 		/*
   3447 		 * Handle shutdown message.  We really should
   3448 		 * jump out of this loop right away, but
   3449 		 * it doesn't matter if we have to do a little
   3450 		 * more work first.
   3451 		 */
   3452 		if (msg == SELECT_POKE_SHUTDOWN) {
   3453 			return (true);
   3454 		}
   3455 
   3456 		/*
   3457 		 * This is a wakeup on a socket.  Look
   3458 		 * at the event queue for both read and write,
   3459 		 * and decide if we need to watch on it now
   3460 		 * or not.
   3461 		 */
   3462 		wakeup_socket(thread, fd, msg);
   3463 	}
   3464 
   3465 	return (false);
   3466 }
   3467 
/*
 * This is the thread that will loop forever, always in a select or poll
 * call.
 *
 * When select returns something to do, do whatever's necessary and post
 * an event to the task that was requesting the action.
 *
 * 'uap' is the isc__socketthread_t this watcher serves.  The loop waits in
 * the event call selected at compile time (kevent, epoll_wait,
 * ioctl(DP_POLL) or select), retries while the call fails with a soft
 * error, aborts through FATAL_ERROR on any other failure, and hands the
 * reported events to process_fds().  It ends once the control channel
 * processing reports that it is done.
 *
 * Always returns 0.
 */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	/* 'manager' is only used by the select() variant below. */
	(void)manager;
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Wait for events; repeat while the wait call fails softly. */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			/*
			 * At most two attempts: the second one runs only
			 * after an EINVAL refreshed open_max.
			 */
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else  /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			/*
			 * select() works on copies of the fd sets, taken
			 * together with maxfd under the manager lock.
			 */
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Only soft errors are retried; others are fatal. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			/*
			 * Track the poll state: a timeout steps active ->
			 * checking -> idle, any event returns to active.
			 */
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
   3631 
   3632 void
   3633 isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
   3634 	REQUIRE(VALID_MANAGER(manager));
   3635 
   3636 	manager->reserved = reserved;
   3637 }
   3638 
   3639 void
   3640 isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
   3641 	REQUIRE(VALID_MANAGER(manager));
   3642 
   3643 	manager->maxudp = maxudp;
   3644 }
   3645 
/*
 * Set up a socket thread.  thread->manager and thread->threadid must
 * already be filled in by the caller.
 */
   3649 
   3650 static isc_result_t
   3651 setup_thread(isc__socketthread_t *thread) {
   3652 	isc_result_t result = ISC_R_SUCCESS;
   3653 	int i;
   3654 	char strbuf[ISC_STRERRORSIZE];
   3655 
   3656 	REQUIRE(thread != NULL);
   3657 	REQUIRE(VALID_MANAGER(thread->manager));
   3658 	REQUIRE(thread->threadid >= 0 &&
   3659 		thread->threadid < thread->manager->nthreads);
   3660 
   3661 	thread->fds =
   3662 		isc_mem_get(thread->manager->mctx,
   3663 			    thread->manager->maxsocks * sizeof(isc_socket_t *));
   3664 
   3665 	memset(thread->fds, 0,
   3666 	       thread->manager->maxsocks * sizeof(isc_socket_t *));
   3667 
   3668 	thread->fdstate = isc_mem_get(thread->manager->mctx,
   3669 				      thread->manager->maxsocks * sizeof(int));
   3670 
   3671 	memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
   3672 
   3673 	thread->fdlock = isc_mem_get(thread->manager->mctx,
   3674 				     FDLOCK_COUNT * sizeof(isc_mutex_t));
   3675 
   3676 	for (i = 0; i < FDLOCK_COUNT; i++) {
   3677 		isc_mutex_init(&thread->fdlock[i]);
   3678 	}
   3679 
   3680 	if (pipe(thread->pipe_fds) != 0) {
   3681 		strerror_r(errno, strbuf, sizeof(strbuf));
   3682 		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
   3683 				 strbuf);
   3684 		return (ISC_R_UNEXPECTED);
   3685 	}
   3686 	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
   3687 
   3688 #ifdef USE_KQUEUE
   3689 	thread->nevents = ISC_SOCKET_MAXEVENTS;
   3690 	thread->events = isc_mem_get(thread->manager->mctx,
   3691 				     sizeof(struct kevent) * thread->nevents);
   3692 
   3693 	thread->kqueue_fd = kqueue();
   3694 	if (thread->kqueue_fd == -1) {
   3695 		result = isc__errno2result(errno);
   3696 		strerror_r(errno, strbuf, sizeof(strbuf));
   3697 		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
   3698 				 strbuf);
   3699 		isc_mem_put(thread->manager->mctx, thread->events,
   3700 			    sizeof(struct kevent) * thread->nevents);
   3701 		return (result);
   3702 	}
   3703 
   3704 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
   3705 	if (result != ISC_R_SUCCESS) {
   3706 		close(thread->kqueue_fd);
   3707 		isc_mem_put(thread->manager->mctx, thread->events,
   3708 			    sizeof(struct kevent) * thread->nevents);
   3709 	}
   3710 	return (result);
   3711 
   3712 #elif defined(USE_EPOLL)
   3713 	thread->nevents = ISC_SOCKET_MAXEVENTS;
   3714 	thread->epoll_events =
   3715 		isc_mem_get(thread->manager->mctx,
   3716 			    (thread->manager->maxsocks * sizeof(uint32_t)));
   3717 
   3718 	memset(thread->epoll_events, 0,
   3719 	       thread->manager->maxsocks * sizeof(uint32_t));
   3720 
   3721 	thread->events =
   3722 		isc_mem_get(thread->manager->mctx,
   3723 			    sizeof(struct epoll_event) * thread->nevents);
   3724 
   3725 	thread->epoll_fd = epoll_create(thread->nevents);
   3726 	if (thread->epoll_fd == -1) {
   3727 		result = isc__errno2result(errno);
   3728 		strerror_r(errno, strbuf, sizeof(strbuf));
   3729 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
   3730 				 strbuf);
   3731 		return (result);
   3732 	}
   3733 
   3734 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
   3735 	return (result);
   3736 
   3737 #elif defined(USE_DEVPOLL)
   3738 	thread->nevents = ISC_SOCKET_MAXEVENTS;
   3739 	result = isc_resource_getcurlimit(isc_resource_openfiles,
   3740 					  &thread->open_max);
   3741 	if (result != ISC_R_SUCCESS) {
   3742 		thread->open_max = 64;
   3743 	}
   3744 	thread->calls = 0;
   3745 	thread->events = isc_mem_get(thread->manager->mctx,
   3746 				     sizeof(struct pollfd) * thread->nevents);
   3747 
   3748 	/*
   3749 	 * Note: fdpollinfo should be able to support all possible FDs, so
   3750 	 * it must have maxsocks entries (not nevents).
   3751 	 */
   3752 	thread->fdpollinfo =
   3753 		isc_mem_get(thread->manager->mctx,
   3754 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
   3755 	memset(thread->fdpollinfo, 0,
   3756 	       sizeof(pollinfo_t) * thread->manager->maxsocks);
   3757 	thread->devpoll_fd = open("/dev/poll", O_RDWR);
   3758 	if (thread->devpoll_fd == -1) {
   3759 		result = isc__errno2result(errno);
   3760 		strerror_r(errno, strbuf, sizeof(strbuf));
   3761 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   3762 				 "open(/dev/poll) failed: %s", strbuf);
   3763 		isc_mem_put(thread->manager->mctx, thread->events,
   3764 			    sizeof(struct pollfd) * thread->nevents);
   3765 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
   3766 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
   3767 		return (result);
   3768 	}
   3769 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
   3770 	if (result != ISC_R_SUCCESS) {
   3771 		close(thread->devpoll_fd);
   3772 		isc_mem_put(thread->manager->mctx, thread->events,
   3773 			    sizeof(struct pollfd) * thread->nevents);
   3774 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
   3775 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
   3776 		return (result);
   3777 	}
   3778 
   3779 	return (ISC_R_SUCCESS);
   3780 #elif defined(USE_SELECT)
   3781 	UNUSED(result);
   3782 
   3783 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
   3784 	/*
   3785 	 * Note: this code should also cover the case of MAXSOCKETS <=
   3786 	 * FD_SETSIZE, but we separate the cases to avoid possible portability
   3787 	 * issues regarding howmany() and the actual representation of fd_set.
   3788 	 */
   3789 	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
   3790 			     sizeof(fd_mask);
   3791 #else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
   3792 	thread->fd_bufsize = sizeof(fd_set);
   3793 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
   3794 
   3795 	thread->read_fds = isc_mem_get(thread->manager->mctx,
   3796 				       thread->fd_bufsize);
   3797 	thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
   3798 					    thread->fd_bufsize);
   3799 	thread->write_fds = isc_mem_get(thread->manager->mctx,
   3800 					thread->fd_bufsize);
   3801 	thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
   3802 					     thread->fd_bufsize);
   3803 	memset(thread->read_fds, 0, thread->fd_bufsize);
   3804 	memset(thread->write_fds, 0, thread->fd_bufsize);
   3805 
   3806 	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
   3807 	thread->maxfd = thread->pipe_fds[0];
   3808 
   3809 	return (ISC_R_SUCCESS);
   3810 #endif /* USE_KQUEUE */
   3811 }
   3812 
   3813 static void
   3814 cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
   3815 	isc_result_t result;
   3816 	int i;
   3817 
   3818 	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
   3819 	if (result != ISC_R_SUCCESS) {
   3820 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
   3821 	}
   3822 #ifdef USE_KQUEUE
   3823 	close(thread->kqueue_fd);
   3824 	isc_mem_put(mctx, thread->events,
   3825 		    sizeof(struct kevent) * thread->nevents);
   3826 #elif defined(USE_EPOLL)
   3827 	close(thread->epoll_fd);
   3828 
   3829 	isc_mem_put(mctx, thread->events,
   3830 		    sizeof(struct epoll_event) * thread->nevents);
   3831 #elif defined(USE_DEVPOLL)
   3832 	close(thread->devpoll_fd);
   3833 	isc_mem_put(mctx, thread->events,
   3834 		    sizeof(struct pollfd) * thread->nevents);
   3835 	isc_mem_put(mctx, thread->fdpollinfo,
   3836 		    sizeof(pollinfo_t) * thread->manager->maxsocks);
   3837 #elif defined(USE_SELECT)
   3838 	if (thread->read_fds != NULL) {
   3839 		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
   3840 	}
   3841 	if (thread->read_fds_copy != NULL) {
   3842 		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
   3843 	}
   3844 	if (thread->write_fds != NULL) {
   3845 		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
   3846 	}
   3847 	if (thread->write_fds_copy != NULL) {
   3848 		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
   3849 	}
   3850 #endif /* USE_KQUEUE */
   3851 	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
   3852 		if (thread->fdstate[i] == CLOSE_PENDING) {
   3853 			/* no need to lock */
   3854 			(void)close(i);
   3855 		}
   3856 	}
   3857 
   3858 #if defined(USE_EPOLL)
   3859 	isc_mem_put(thread->manager->mctx, thread->epoll_events,
   3860 		    thread->manager->maxsocks * sizeof(uint32_t));
   3861 #endif /* if defined(USE_EPOLL) */
   3862 	isc_mem_put(thread->manager->mctx, thread->fds,
   3863 		    thread->manager->maxsocks * sizeof(isc_socket_t *));
   3864 	isc_mem_put(thread->manager->mctx, thread->fdstate,
   3865 		    thread->manager->maxsocks * sizeof(int));
   3866 
   3867 	for (i = 0; i < FDLOCK_COUNT; i++) {
   3868 		isc_mutex_destroy(&thread->fdlock[i]);
   3869 	}
   3870 	isc_mem_put(thread->manager->mctx, thread->fdlock,
   3871 		    FDLOCK_COUNT * sizeof(isc_mutex_t));
   3872 }
   3873 
   3874 isc_result_t
   3875 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
   3876 	return (isc_socketmgr_create2(mctx, managerp, 0, 1));
   3877 }
   3878 
   3879 isc_result_t
   3880 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
   3881 		      unsigned int maxsocks, int nthreads) {
   3882 	int i;
   3883 	isc_socketmgr_t *manager;
   3884 
   3885 	REQUIRE(managerp != NULL && *managerp == NULL);
   3886 
   3887 	if (maxsocks == 0) {
   3888 		maxsocks = ISC_SOCKET_MAXSOCKETS;
   3889 	}
   3890 
   3891 	manager = isc_mem_get(mctx, sizeof(*manager));
   3892 
   3893 	/* zero-clear so that necessary cleanup on failure will be easy */
   3894 	memset(manager, 0, sizeof(*manager));
   3895 	manager->maxsocks = maxsocks;
   3896 	manager->reserved = 0;
   3897 	manager->maxudp = 0;
   3898 	manager->nthreads = nthreads;
   3899 	manager->stats = NULL;
   3900 
   3901 	manager->magic = SOCKET_MANAGER_MAGIC;
   3902 	manager->mctx = NULL;
   3903 	ISC_LIST_INIT(manager->socklist);
   3904 	isc_mutex_init(&manager->lock);
   3905 	isc_condition_init(&manager->shutdown_ok);
   3906 
   3907 	/*
   3908 	 * Start up the select/poll thread.
   3909 	 */
   3910 	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
   3911 						     manager->nthreads);
   3912 	isc_mem_attach(mctx, &manager->mctx);
   3913 
   3914 	for (i = 0; i < manager->nthreads; i++) {
   3915 		manager->threads[i].manager = manager;
   3916 		manager->threads[i].threadid = i;
   3917 		setup_thread(&manager->threads[i]);
   3918 		isc_thread_create(netthread, &manager->threads[i],
   3919 				  &manager->threads[i].thread);
   3920 		char tname[1024];
   3921 		sprintf(tname, "sock-%d", i);
   3922 		isc_thread_setname(manager->threads[i].thread, tname);
   3923 	}
   3924 
   3925 	*managerp = manager;
   3926 
   3927 	return (ISC_R_SUCCESS);
   3928 }
   3929 
   3930 isc_result_t
   3931 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
   3932 	REQUIRE(VALID_MANAGER(manager));
   3933 	REQUIRE(nsockp != NULL);
   3934 
   3935 	*nsockp = manager->maxsocks;
   3936 
   3937 	return (ISC_R_SUCCESS);
   3938 }
   3939 
   3940 void
   3941 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
   3942 	REQUIRE(VALID_MANAGER(manager));
   3943 	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
   3944 	REQUIRE(manager->stats == NULL);
   3945 	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
   3946 
   3947 	isc_stats_attach(stats, &manager->stats);
   3948 }
   3949 
   3950 void
   3951 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
   3952 	isc_socketmgr_t *manager;
   3953 
   3954 	/*
   3955 	 * Destroy a socket manager.
   3956 	 */
   3957 
   3958 	REQUIRE(managerp != NULL);
   3959 	manager = *managerp;
   3960 	REQUIRE(VALID_MANAGER(manager));
   3961 
   3962 	LOCK(&manager->lock);
   3963 
   3964 	/*
   3965 	 * Wait for all sockets to be destroyed.
   3966 	 */
   3967 	while (!ISC_LIST_EMPTY(manager->socklist)) {
   3968 		manager_log(manager, CREATION, "sockets exist");
   3969 		WAIT(&manager->shutdown_ok, &manager->lock);
   3970 	}
   3971 
   3972 	UNLOCK(&manager->lock);
   3973 
   3974 	/*
   3975 	 * Here, poke our select/poll thread.  Do this by closing the write
   3976 	 * half of the pipe, which will send EOF to the read half.
   3977 	 * This is currently a no-op in the non-threaded case.
   3978 	 */
   3979 	for (int i = 0; i < manager->nthreads; i++) {
   3980 		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
   3981 	}
   3982 
   3983 	/*
   3984 	 * Wait for thread to exit.
   3985 	 */
   3986 	for (int i = 0; i < manager->nthreads; i++) {
   3987 		isc_thread_join(manager->threads[i].thread, NULL);
   3988 		cleanup_thread(manager->mctx, &manager->threads[i]);
   3989 	}
   3990 	/*
   3991 	 * Clean up.
   3992 	 */
   3993 	isc_mem_put(manager->mctx, manager->threads,
   3994 		    sizeof(isc__socketthread_t) * manager->nthreads);
   3995 	(void)isc_condition_destroy(&manager->shutdown_ok);
   3996 
   3997 	if (manager->stats != NULL) {
   3998 		isc_stats_detach(&manager->stats);
   3999 	}
   4000 	isc_mutex_destroy(&manager->lock);
   4001 	manager->magic = 0;
   4002 	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
   4003 
   4004 	*managerp = NULL;
   4005 }
   4006 
   4007 static isc_result_t
   4008 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
   4009 	    unsigned int flags) {
   4010 	int io_state;
   4011 	bool have_lock = false;
   4012 	isc_task_t *ntask = NULL;
   4013 	isc_result_t result = ISC_R_SUCCESS;
   4014 
   4015 	dev->ev_sender = task;
   4016 
   4017 	if (sock->type == isc_sockettype_udp) {
   4018 		io_state = doio_recv(sock, dev);
   4019 	} else {
   4020 		LOCK(&sock->lock);
   4021 		have_lock = true;
   4022 
   4023 		if (ISC_LIST_EMPTY(sock->recv_list)) {
   4024 			io_state = doio_recv(sock, dev);
   4025 		} else {
   4026 			io_state = DOIO_SOFT;
   4027 		}
   4028 	}
   4029 
   4030 	switch (io_state) {
   4031 	case DOIO_SOFT:
   4032 		/*
   4033 		 * We couldn't read all or part of the request right now, so
   4034 		 * queue it.
   4035 		 *
   4036 		 * Attach to socket and to task
   4037 		 */
   4038 		isc_task_attach(task, &ntask);
   4039 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
   4040 
   4041 		if (!have_lock) {
   4042 			LOCK(&sock->lock);
   4043 			have_lock = true;
   4044 		}
   4045 
   4046 		/*
   4047 		 * Enqueue the request.  If the socket was previously not being
   4048 		 * watched, poke the watcher to start paying attention to it.
   4049 		 */
   4050 		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
   4051 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
   4052 		if (do_poke) {
   4053 			select_poke(sock->manager, sock->threadid, sock->fd,
   4054 				    SELECT_POKE_READ);
   4055 		}
   4056 
   4057 		socket_log(sock, NULL, EVENT,
   4058 			   "socket_recv: event %p -> task %p", dev, ntask);
   4059 
   4060 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
   4061 			result = ISC_R_INPROGRESS;
   4062 		}
   4063 		break;
   4064 
   4065 	case DOIO_EOF:
   4066 		dev->result = ISC_R_EOF;
   4067 		FALLTHROUGH;
   4068 
   4069 	case DOIO_HARD:
   4070 	case DOIO_SUCCESS:
   4071 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
   4072 			send_recvdone_event(sock, &dev);
   4073 		}
   4074 		break;
   4075 	}
   4076 
   4077 	if (have_lock) {
   4078 		UNLOCK(&sock->lock);
   4079 	}
   4080 
   4081 	return (result);
   4082 }
   4083 
   4084 isc_result_t
   4085 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
   4086 		isc_task_t *task, isc_taskaction_t action, void *arg) {
   4087 	isc_socketevent_t *dev;
   4088 	isc_socketmgr_t *manager;
   4089 
   4090 	REQUIRE(VALID_SOCKET(sock));
   4091 	REQUIRE(action != NULL);
   4092 
   4093 	manager = sock->manager;
   4094 	REQUIRE(VALID_MANAGER(manager));
   4095 
   4096 	INSIST(sock->bound);
   4097 
   4098 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
   4099 				   action, arg);
   4100 	if (dev == NULL) {
   4101 		return (ISC_R_NOMEMORY);
   4102 	}
   4103 
   4104 	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
   4105 }
   4106 
   4107 isc_result_t
   4108 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
   4109 		 isc_task_t *task, isc_socketevent_t *event,
   4110 		 unsigned int flags) {
   4111 	event->ev_sender = sock;
   4112 	event->result = ISC_R_UNSET;
   4113 	event->region = *region;
   4114 	event->n = 0;
   4115 	event->offset = 0;
   4116 	event->attributes = 0;
   4117 
   4118 	/*
   4119 	 * UDP sockets are always partial read.
   4120 	 */
   4121 	if (sock->type == isc_sockettype_udp) {
   4122 		event->minimum = 1;
   4123 	} else {
   4124 		if (minimum == 0) {
   4125 			event->minimum = region->length;
   4126 		} else {
   4127 			event->minimum = minimum;
   4128 		}
   4129 	}
   4130 
   4131 	return (socket_recv(sock, event, task, flags));
   4132 }
   4133 
   4134 static isc_result_t
   4135 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
   4136 	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
   4137 	    unsigned int flags) {
   4138 	int io_state;
   4139 	bool have_lock = false;
   4140 	isc_task_t *ntask = NULL;
   4141 	isc_result_t result = ISC_R_SUCCESS;
   4142 
   4143 	dev->ev_sender = task;
   4144 
   4145 	set_dev_address(address, sock, dev);
   4146 	if (pktinfo != NULL) {
   4147 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
   4148 		dev->pktinfo = *pktinfo;
   4149 
   4150 		if (!isc_sockaddr_issitelocal(&dev->address) &&
   4151 		    !isc_sockaddr_islinklocal(&dev->address))
   4152 		{
   4153 			socket_log(sock, NULL, TRACE,
   4154 				   "pktinfo structure provided, ifindex %u "
   4155 				   "(set to 0)",
   4156 				   pktinfo->ipi6_ifindex);
   4157 
   4158 			/*
   4159 			 * Set the pktinfo index to 0 here, to let the
   4160 			 * kernel decide what interface it should send on.
   4161 			 */
   4162 			dev->pktinfo.ipi6_ifindex = 0;
   4163 		}
   4164 	}
   4165 
   4166 	if (sock->type == isc_sockettype_udp) {
   4167 		io_state = doio_send(sock, dev);
   4168 	} else {
   4169 		LOCK(&sock->lock);
   4170 		have_lock = true;
   4171 
   4172 		if (ISC_LIST_EMPTY(sock->send_list)) {
   4173 			io_state = doio_send(sock, dev);
   4174 		} else {
   4175 			io_state = DOIO_SOFT;
   4176 		}
   4177 	}
   4178 
   4179 	switch (io_state) {
   4180 	case DOIO_SOFT:
   4181 		/*
   4182 		 * We couldn't send all or part of the request right now, so
   4183 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
   4184 		 */
   4185 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
   4186 			isc_task_attach(task, &ntask);
   4187 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
   4188 
   4189 			if (!have_lock) {
   4190 				LOCK(&sock->lock);
   4191 				have_lock = true;
   4192 			}
   4193 
   4194 			/*
   4195 			 * Enqueue the request.  If the socket was previously
   4196 			 * not being watched, poke the watcher to start
   4197 			 * paying attention to it.
   4198 			 */
   4199 			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
   4200 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
   4201 			if (do_poke) {
   4202 				select_poke(sock->manager, sock->threadid,
   4203 					    sock->fd, SELECT_POKE_WRITE);
   4204 			}
   4205 			socket_log(sock, NULL, EVENT,
   4206 				   "socket_send: event %p -> task %p", dev,
   4207 				   ntask);
   4208 
   4209 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
   4210 				result = ISC_R_INPROGRESS;
   4211 			}
   4212 			break;
   4213 		}
   4214 
   4215 		FALLTHROUGH;
   4216 
   4217 	case DOIO_HARD:
   4218 	case DOIO_SUCCESS:
   4219 		if (!have_lock) {
   4220 			LOCK(&sock->lock);
   4221 			have_lock = true;
   4222 		}
   4223 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
   4224 			send_senddone_event(sock, &dev);
   4225 		}
   4226 		break;
   4227 	}
   4228 
   4229 	if (have_lock) {
   4230 		UNLOCK(&sock->lock);
   4231 	}
   4232 
   4233 	return (result);
   4234 }
   4235 
   4236 isc_result_t
   4237 isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
   4238 		isc_taskaction_t action, void *arg) {
   4239 	/*
   4240 	 * REQUIRE() checking is performed in isc_socket_sendto().
   4241 	 */
   4242 	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
   4243 }
   4244 
   4245 isc_result_t
   4246 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
   4247 		  isc_taskaction_t action, void *arg,
   4248 		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
   4249 	isc_socketevent_t *dev;
   4250 	isc_socketmgr_t *manager;
   4251 
   4252 	REQUIRE(VALID_SOCKET(sock));
   4253 	REQUIRE(region != NULL);
   4254 	REQUIRE(task != NULL);
   4255 	REQUIRE(action != NULL);
   4256 
   4257 	manager = sock->manager;
   4258 	REQUIRE(VALID_MANAGER(manager));
   4259 
   4260 	INSIST(sock->bound);
   4261 
   4262 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
   4263 				   action, arg);
   4264 	if (dev == NULL) {
   4265 		return (ISC_R_NOMEMORY);
   4266 	}
   4267 
   4268 	dev->region = *region;
   4269 
   4270 	return (socket_send(sock, dev, task, address, pktinfo, 0));
   4271 }
   4272 
   4273 isc_result_t
   4274 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
   4275 		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
   4276 		   isc_socketevent_t *event, unsigned int flags) {
   4277 	REQUIRE(VALID_SOCKET(sock));
   4278 	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
   4279 		0);
   4280 	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
   4281 		REQUIRE(sock->type == isc_sockettype_udp);
   4282 	}
   4283 	event->ev_sender = sock;
   4284 	event->result = ISC_R_UNSET;
   4285 	event->region = *region;
   4286 	event->n = 0;
   4287 	event->offset = 0;
   4288 	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
   4289 
   4290 	return (socket_send(sock, event, task, address, pktinfo, flags));
   4291 }
   4292 
   4293 void
   4294 isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
   4295 #ifdef ISC_PLATFORM_HAVESYSUNH
   4296 	int s;
   4297 	struct stat sb;
   4298 	char strbuf[ISC_STRERRORSIZE];
   4299 
   4300 	if (sockaddr->type.sa.sa_family != AF_UNIX) {
   4301 		return;
   4302 	}
   4303 
   4304 #ifndef S_ISSOCK
   4305 #if defined(S_IFMT) && defined(S_IFSOCK)
   4306 #define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
   4307 #elif defined(_S_IFMT) && defined(S_IFSOCK)
   4308 #define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
   4309 #endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
   4310 #endif /* ifndef S_ISSOCK */
   4311 
   4312 #ifndef S_ISFIFO
   4313 #if defined(S_IFMT) && defined(S_IFIFO)
   4314 #define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
   4315 #elif defined(_S_IFMT) && defined(S_IFIFO)
   4316 #define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
   4317 #endif /* if defined(S_IFMT) && defined(S_IFIFO) */
   4318 #endif /* ifndef S_ISFIFO */
   4319 
   4320 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
   4321 /* cppcheck-suppress preprocessorErrorDirective */
   4322 #error \
   4323 	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
   4324 #endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */
   4325 
   4326 #ifndef S_ISFIFO
   4327 #define S_ISFIFO(mode) 0
   4328 #endif /* ifndef S_ISFIFO */
   4329 
   4330 #ifndef S_ISSOCK
   4331 #define S_ISSOCK(mode) 0
   4332 #endif /* ifndef S_ISSOCK */
   4333 
   4334 	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
   4335 		switch (errno) {
   4336 		case ENOENT:
   4337 			if (active) { /* We exited cleanly last time */
   4338 				break;
   4339 			}
   4340 			FALLTHROUGH;
   4341 		default:
   4342 			strerror_r(errno, strbuf, sizeof(strbuf));
   4343 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4344 				      ISC_LOGMODULE_SOCKET,
   4345 				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
   4346 				      "isc_socket_cleanunix: stat(%s): %s",
   4347 				      sockaddr->type.sunix.sun_path, strbuf);
   4348 			return;
   4349 		}
   4350 	} else {
   4351 		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
   4352 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4353 				      ISC_LOGMODULE_SOCKET,
   4354 				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
   4355 				      "isc_socket_cleanunix: %s: not a socket",
   4356 				      sockaddr->type.sunix.sun_path);
   4357 			return;
   4358 		}
   4359 	}
   4360 
   4361 	if (active) {
   4362 		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
   4363 			strerror_r(errno, strbuf, sizeof(strbuf));
   4364 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4365 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
   4366 				      "isc_socket_cleanunix: unlink(%s): %s",
   4367 				      sockaddr->type.sunix.sun_path, strbuf);
   4368 		}
   4369 		return;
   4370 	}
   4371 
   4372 	s = socket(AF_UNIX, SOCK_STREAM, 0);
   4373 	if (s < 0) {
   4374 		strerror_r(errno, strbuf, sizeof(strbuf));
   4375 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4376 			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
   4377 			      "isc_socket_cleanunix: socket(%s): %s",
   4378 			      sockaddr->type.sunix.sun_path, strbuf);
   4379 		return;
   4380 	}
   4381 
   4382 	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
   4383 		    sizeof(sockaddr->type.sunix)) < 0)
   4384 	{
   4385 		switch (errno) {
   4386 		case ECONNREFUSED:
   4387 		case ECONNRESET:
   4388 			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
   4389 				strerror_r(errno, strbuf, sizeof(strbuf));
   4390 				isc_log_write(
   4391 					isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4392 					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
   4393 					"isc_socket_cleanunix: "
   4394 					"unlink(%s): %s",
   4395 					sockaddr->type.sunix.sun_path, strbuf);
   4396 			}
   4397 			break;
   4398 		default:
   4399 			strerror_r(errno, strbuf, sizeof(strbuf));
   4400 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4401 				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
   4402 				      "isc_socket_cleanunix: connect(%s): %s",
   4403 				      sockaddr->type.sunix.sun_path, strbuf);
   4404 			break;
   4405 		}
   4406 	}
   4407 	close(s);
   4408 #else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
   4409 	UNUSED(sockaddr);
   4410 	UNUSED(active);
   4411 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
   4412 }
   4413 
   4414 isc_result_t
   4415 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
   4416 		    uint32_t owner, uint32_t group) {
   4417 #ifdef ISC_PLATFORM_HAVESYSUNH
   4418 	isc_result_t result = ISC_R_SUCCESS;
   4419 	char strbuf[ISC_STRERRORSIZE];
   4420 	char path[sizeof(sockaddr->type.sunix.sun_path)];
   4421 #ifdef NEED_SECURE_DIRECTORY
   4422 	char *slash;
   4423 #endif /* ifdef NEED_SECURE_DIRECTORY */
   4424 
   4425 	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
   4426 	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
   4427 	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
   4428 
   4429 #ifdef NEED_SECURE_DIRECTORY
   4430 	slash = strrchr(path, '/');
   4431 	if (slash != NULL) {
   4432 		if (slash != path) {
   4433 			*slash = '\0';
   4434 		} else {
   4435 			strlcpy(path, "/", sizeof(path));
   4436 		}
   4437 	} else {
   4438 		strlcpy(path, ".", sizeof(path));
   4439 	}
   4440 #endif /* ifdef NEED_SECURE_DIRECTORY */
   4441 
   4442 	if (chmod(path, perm) < 0) {
   4443 		strerror_r(errno, strbuf, sizeof(strbuf));
   4444 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4445 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
   4446 			      "isc_socket_permunix: chmod(%s, %d): %s", path,
   4447 			      perm, strbuf);
   4448 		result = ISC_R_FAILURE;
   4449 	}
   4450 	if (chown(path, owner, group) < 0) {
   4451 		strerror_r(errno, strbuf, sizeof(strbuf));
   4452 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4453 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
   4454 			      "isc_socket_permunix: chown(%s, %d, %d): %s",
   4455 			      path, owner, group, strbuf);
   4456 		result = ISC_R_FAILURE;
   4457 	}
   4458 	return (result);
   4459 #else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
   4460 	UNUSED(sockaddr);
   4461 	UNUSED(perm);
   4462 	UNUSED(owner);
   4463 	UNUSED(group);
   4464 	return (ISC_R_NOTIMPLEMENTED);
   4465 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
   4466 }
   4467 
   4468 isc_result_t
   4469 isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
   4470 		isc_socket_options_t options) {
   4471 	char strbuf[ISC_STRERRORSIZE];
   4472 	int on = 1;
   4473 
   4474 	REQUIRE(VALID_SOCKET(sock));
   4475 
   4476 	LOCK(&sock->lock);
   4477 
   4478 	INSIST(!sock->bound);
   4479 	INSIST(!sock->dupped);
   4480 
   4481 	if (sock->pf != sockaddr->type.sa.sa_family) {
   4482 		UNLOCK(&sock->lock);
   4483 		return (ISC_R_FAMILYMISMATCH);
   4484 	}
   4485 
   4486 	/*
   4487 	 * Only set SO_REUSEADDR when we want a specific port.
   4488 	 */
   4489 #ifdef AF_UNIX
   4490 	if (sock->pf == AF_UNIX) {
   4491 		goto bind_socket;
   4492 	}
   4493 #endif /* ifdef AF_UNIX */
   4494 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
   4495 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
   4496 	{
   4497 		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
   4498 			       sizeof(on)) < 0)
   4499 		{
   4500 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   4501 					 "setsockopt(%d) failed", sock->fd);
   4502 		}
   4503 #if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
   4504 		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
   4505 			       (void *)&on, sizeof(on)) < 0)
   4506 		{
   4507 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   4508 					 "setsockopt(%d) failed", sock->fd);
   4509 		}
   4510 #elif defined(__linux__) && defined(SO_REUSEPORT)
   4511 		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
   4512 			       sizeof(on)) < 0)
   4513 		{
   4514 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   4515 					 "setsockopt(%d) failed", sock->fd);
   4516 		}
   4517 #endif		/* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
   4518 		/* Press on... */
   4519 	}
   4520 #ifdef AF_UNIX
   4521 bind_socket:
   4522 #endif /* ifdef AF_UNIX */
   4523 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
   4524 		inc_stats(sock->manager->stats,
   4525 			  sock->statsindex[STATID_BINDFAIL]);
   4526 
   4527 		UNLOCK(&sock->lock);
   4528 		switch (errno) {
   4529 		case EACCES:
   4530 			return (ISC_R_NOPERM);
   4531 		case EADDRNOTAVAIL:
   4532 			return (ISC_R_ADDRNOTAVAIL);
   4533 		case EADDRINUSE:
   4534 			return (ISC_R_ADDRINUSE);
   4535 		case EINVAL:
   4536 			return (ISC_R_BOUND);
   4537 		default:
   4538 			strerror_r(errno, strbuf, sizeof(strbuf));
   4539 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
   4540 					 strbuf);
   4541 			return (ISC_R_UNEXPECTED);
   4542 		}
   4543 	}
   4544 
   4545 	socket_log(sock, sockaddr, TRACE, "bound");
   4546 	sock->bound = 1;
   4547 
   4548 	UNLOCK(&sock->lock);
   4549 	return (ISC_R_SUCCESS);
   4550 }
   4551 
/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
   4557 #undef ENABLE_ACCEPTFILTER
   4558 
   4559 isc_result_t
   4560 isc_socket_filter(isc_socket_t *sock, const char *filter) {
   4561 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
   4562 	char strbuf[ISC_STRERRORSIZE];
   4563 	struct accept_filter_arg afa;
   4564 #else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
   4565 	UNUSED(sock);
   4566 	UNUSED(filter);
   4567 #endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
   4568 
   4569 	REQUIRE(VALID_SOCKET(sock));
   4570 
   4571 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
   4572 	bzero(&afa, sizeof(afa));
   4573 	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
   4574 	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
   4575 		       sizeof(afa)) == -1)
   4576 	{
   4577 		strerror_r(errno, strbuf, sizeof(strbuf));
   4578 		socket_log(sock, NULL, CREATION,
   4579 			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
   4580 		return (ISC_R_FAILURE);
   4581 	}
   4582 	return (ISC_R_SUCCESS);
   4583 #else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
   4584 	return (ISC_R_NOTIMPLEMENTED);
   4585 #endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
   4586 }
   4587 
   4588 /*
   4589  * Try enabling TCP Fast Open for a given socket if the OS supports it.
   4590  */
   4591 static void
   4592 set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
   4593 #if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
   4594 	char strbuf[ISC_STRERRORSIZE];
   4595 
   4596 /*
   4597  * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
   4598  * shipping a default kernel without TFO support, so we special-case it by
   4599  * performing an additional runtime check for TFO support using sysctl to
   4600  * prevent setsockopt() errors from being logged.
   4601  */
   4602 #if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
   4603 #define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
   4604 	unsigned int enabled;
   4605 	size_t enabledlen = sizeof(enabled);
   4606 	static bool tfo_notice_logged = false;
   4607 
   4608 	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
   4609 		/*
   4610 		 * This kernel does not support TCP Fast Open.  There is
   4611 		 * nothing more we can do.
   4612 		 */
   4613 		return;
   4614 	} else if (enabled == 0) {
   4615 		/*
   4616 		 * This kernel does support TCP Fast Open, but it is disabled
   4617 		 * by sysctl.  Notify the user, but do not nag.
   4618 		 */
   4619 		if (!tfo_notice_logged) {
   4620 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
   4621 				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
   4622 				      "TCP_FASTOPEN support is disabled by "
   4623 				      "sysctl (" SYSCTL_TFO " = 0)");
   4624 			tfo_notice_logged = true;
   4625 		}
   4626 		return;
   4627 	}
   4628 #endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */
   4629 
   4630 #ifdef __APPLE__
   4631 	backlog = 1;
   4632 #else  /* ifdef __APPLE__ */
   4633 	backlog = backlog / 2;
   4634 	if (backlog == 0) {
   4635 		backlog = 1;
   4636 	}
   4637 #endif /* ifdef __APPLE__ */
   4638 	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
   4639 		       sizeof(backlog)) < 0)
   4640 	{
   4641 		strerror_r(errno, strbuf, sizeof(strbuf));
   4642 		UNEXPECTED_ERROR(__FILE__, __LINE__,
   4643 				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
   4644 				 sock->fd, strbuf);
   4645 		/* TCP_FASTOPEN is experimental so ignore failures */
   4646 	}
   4647 #else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
   4648 	UNUSED(sock);
   4649 	UNUSED(backlog);
   4650 #endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
   4651 }
   4652 
   4653 /*
   4654  * Set up to listen on a given socket.  We do this by creating an internal
   4655  * event that will be dispatched when the socket has read activity.  The
   4656  * watcher will send the internal event to the task when there is a new
   4657  * connection.
   4658  *
   4659  * Unlike in read, we don't preallocate a done event here.  Every time there
   4660  * is a new connection we'll have to allocate a new one anyway, so we might
   4661  * as well keep things simple rather than having to track them.
   4662  */
   4663 isc_result_t
   4664 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
   4665 	char strbuf[ISC_STRERRORSIZE];
   4666 
   4667 	REQUIRE(VALID_SOCKET(sock));
   4668 
   4669 	LOCK(&sock->lock);
   4670 
   4671 	REQUIRE(!sock->listener);
   4672 	REQUIRE(sock->bound);
   4673 	REQUIRE(sock->type == isc_sockettype_tcp ||
   4674 		sock->type == isc_sockettype_unix);
   4675 
   4676 	if (backlog == 0) {
   4677 		backlog = SOMAXCONN;
   4678 	}
   4679 
   4680 	if (listen(sock->fd, (int)backlog) < 0) {
   4681 		UNLOCK(&sock->lock);
   4682 		strerror_r(errno, strbuf, sizeof(strbuf));
   4683 
   4684 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
   4685 
   4686 		return (ISC_R_UNEXPECTED);
   4687 	}
   4688 
   4689 	set_tcp_fastopen(sock, backlog);
   4690 
   4691 	sock->listener = 1;
   4692 
   4693 	UNLOCK(&sock->lock);
   4694 	return (ISC_R_SUCCESS);
   4695 }
   4696 
   4697 /*
   4698  * This should try to do aggressive accept() XXXMLG
   4699  */
   4700 isc_result_t
   4701 isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
   4702 		  void *arg) {
   4703 	isc_socket_newconnev_t *dev;
   4704 	isc_socketmgr_t *manager;
   4705 	isc_task_t *ntask = NULL;
   4706 	isc_socket_t *nsock;
   4707 	isc_result_t result;
   4708 	bool do_poke = false;
   4709 
   4710 	REQUIRE(VALID_SOCKET(sock));
   4711 	manager = sock->manager;
   4712 	REQUIRE(VALID_MANAGER(manager));
   4713 
   4714 	LOCK(&sock->lock);
   4715 
   4716 	REQUIRE(sock->listener);
   4717 
   4718 	/*
   4719 	 * Sender field is overloaded here with the task we will be sending
   4720 	 * this event to.  Just before the actual event is delivered the
   4721 	 * actual ev_sender will be touched up to be the socket.
   4722 	 */
   4723 	dev = (isc_socket_newconnev_t *)isc_event_allocate(
   4724 		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
   4725 		sizeof(*dev));
   4726 	ISC_LINK_INIT(dev, ev_link);
   4727 
   4728 	result = allocate_socket(manager, sock->type, &nsock);
   4729 	if (result != ISC_R_SUCCESS) {
   4730 		isc_event_free(ISC_EVENT_PTR(&dev));
   4731 		UNLOCK(&sock->lock);
   4732 		return (result);
   4733 	}
   4734 
   4735 	/*
   4736 	 * Attach to socket and to task.
   4737 	 */
   4738 	isc_task_attach(task, &ntask);
   4739 	if (isc_task_exiting(ntask)) {
   4740 		free_socket(&nsock);
   4741 		isc_task_detach(&ntask);
   4742 		isc_event_free(ISC_EVENT_PTR(&dev));
   4743 		UNLOCK(&sock->lock);
   4744 		return (ISC_R_SHUTTINGDOWN);
   4745 	}
   4746 	isc_refcount_increment0(&nsock->references);
   4747 	nsock->statsindex = sock->statsindex;
   4748 
   4749 	dev->ev_sender = ntask;
   4750 	dev->newsocket = nsock;
   4751 
   4752 	/*
   4753 	 * Poke watcher here.  We still have the socket locked, so there
   4754 	 * is no race condition.  We will keep the lock for such a short
   4755 	 * bit of time waking it up now or later won't matter all that much.
   4756 	 */
   4757 	do_poke = ISC_LIST_EMPTY(sock->accept_list);
   4758 	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
   4759 	if (do_poke) {
   4760 		select_poke(manager, sock->threadid, sock->fd,
   4761 			    SELECT_POKE_ACCEPT);
   4762 	}
   4763 	UNLOCK(&sock->lock);
   4764 	return (ISC_R_SUCCESS);
   4765 }
   4766 
   4767 isc_result_t
   4768 isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
   4769 		   isc_task_t *task, isc_taskaction_t action, void *arg) {
   4770 	isc_socket_connev_t *dev;
   4771 	isc_task_t *ntask = NULL;
   4772 	isc_socketmgr_t *manager;
   4773 	int cc;
   4774 	char strbuf[ISC_STRERRORSIZE];
   4775 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
   4776 
   4777 	REQUIRE(VALID_SOCKET(sock));
   4778 	REQUIRE(addr != NULL);
   4779 	REQUIRE(task != NULL);
   4780 	REQUIRE(action != NULL);
   4781 
   4782 	manager = sock->manager;
   4783 	REQUIRE(VALID_MANAGER(manager));
   4784 	REQUIRE(addr != NULL);
   4785 
   4786 	if (isc_sockaddr_ismulticast(addr)) {
   4787 		return (ISC_R_MULTICAST);
   4788 	}
   4789 
   4790 	LOCK(&sock->lock);
   4791 
   4792 	dev = (isc_socket_connev_t *)isc_event_allocate(
   4793 		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
   4794 		sizeof(*dev));
   4795 	ISC_LINK_INIT(dev, ev_link);
   4796 
   4797 	if (sock->connecting) {
   4798 		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
   4799 		goto queue;
   4800 	}
   4801 
   4802 	if (sock->connected) {
   4803 		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
   4804 		dev->result = ISC_R_SUCCESS;
   4805 		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
   4806 
   4807 		UNLOCK(&sock->lock);
   4808 
   4809 		return (ISC_R_SUCCESS);
   4810 	}
   4811 
   4812 	/*
   4813 	 * Try to do the connect right away, as there can be only one
   4814 	 * outstanding, and it might happen to complete.
   4815 	 */
   4816 	sock->peer_address = *addr;
   4817 	cc = connect(sock->fd, &addr->type.sa, addr->length);
   4818 	if (cc < 0) {
   4819 		/*
   4820 		 * The socket is nonblocking and the connection cannot be
   4821 		 * completed immediately.  It is possible to select(2) or
   4822 		 * poll(2) for completion by selecting the socket for writing.
   4823 		 * After select(2) indicates writability, use getsockopt(2) to
   4824 		 * read the SO_ERROR option at level SOL_SOCKET to determine
   4825 		 * whether connect() completed successfully (SO_ERROR is zero)
   4826 		 * or unsuccessfully (SO_ERROR is one of the usual error codes
   4827 		 * listed here, explaining the reason for the failure).
   4828 		 */
   4829 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
   4830 			cc = 0;
   4831 			goto success;
   4832 		}
   4833 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
   4834 			goto queue;
   4835 		}
   4836 
   4837 		switch (errno) {
   4838 #define ERROR_MATCH(a, b)        \
   4839 	case a:                  \
   4840 		dev->result = b; \
   4841 		goto err_exit;
   4842 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
   4843 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
   4844 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
   4845 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
   4846 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
   4847 #ifdef EHOSTDOWN
   4848 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
   4849 #endif /* ifdef EHOSTDOWN */
   4850 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
   4851 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
   4852 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
   4853 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
   4854 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
   4855 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
   4856 #undef ERROR_MATCH
   4857 		}
   4858 
   4859 		sock->connected = 0;
   4860 
   4861 		strerror_r(errno, strbuf, sizeof(strbuf));
   4862 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
   4863 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
   4864 				 addrbuf, errno, strbuf);
   4865 
   4866 		UNLOCK(&sock->lock);
   4867 		inc_stats(sock->manager->stats,
   4868 			  sock->statsindex[STATID_CONNECTFAIL]);
   4869 		isc_event_free(ISC_EVENT_PTR(&dev));
   4870 		return (ISC_R_UNEXPECTED);
   4871 
   4872 	err_exit:
   4873 		sock->connected = 0;
   4874 		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
   4875 
   4876 		UNLOCK(&sock->lock);
   4877 		inc_stats(sock->manager->stats,
   4878 			  sock->statsindex[STATID_CONNECTFAIL]);
   4879 		return (ISC_R_SUCCESS);
   4880 	}
   4881 
   4882 	/*
   4883 	 * If connect completed, fire off the done event.
   4884 	 */
   4885 success:
   4886 	if (cc == 0) {
   4887 		sock->connected = 1;
   4888 		sock->bound = 1;
   4889 		dev->result = ISC_R_SUCCESS;
   4890 		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
   4891 
   4892 		UNLOCK(&sock->lock);
   4893 
   4894 		inc_stats(sock->manager->stats,
   4895 			  sock->statsindex[STATID_CONNECT]);
   4896 
   4897 		return (ISC_R_SUCCESS);
   4898 	}
   4899 
   4900 queue:
   4901 
   4902 	/*
   4903 	 * Attach to task.
   4904 	 */
   4905 	isc_task_attach(task, &ntask);
   4906 
   4907 	dev->ev_sender = ntask;
   4908 
   4909 	/*
   4910 	 * Poke watcher here.  We still have the socket locked, so there
   4911 	 * is no race condition.  We will keep the lock for such a short
   4912 	 * bit of time waking it up now or later won't matter all that much.
   4913 	 */
   4914 	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
   4915 	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
   4916 	if (do_poke && !sock->connecting) {
   4917 		sock->connecting = 1;
   4918 		select_poke(manager, sock->threadid, sock->fd,
   4919 			    SELECT_POKE_CONNECT);
   4920 	}
   4921 
   4922 	UNLOCK(&sock->lock);
   4923 	return (ISC_R_SUCCESS);
   4924 }
   4925 
   4926 /*
   4927  * Called when a socket with a pending connect() finishes.
   4928  */
   4929 static void
   4930 internal_connect(isc_socket_t *sock) {
   4931 	isc_socket_connev_t *dev;
   4932 	int cc;
   4933 	isc_result_t result;
   4934 	socklen_t optlen;
   4935 	char strbuf[ISC_STRERRORSIZE];
   4936 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
   4937 
   4938 	INSIST(VALID_SOCKET(sock));
   4939 	REQUIRE(sock->fd >= 0);
   4940 
   4941 	/*
   4942 	 * Get the first item off the connect list.
   4943 	 * If it is empty, unlock the socket and return.
   4944 	 */
   4945 	dev = ISC_LIST_HEAD(sock->connect_list);
   4946 	if (dev == NULL) {
   4947 		INSIST(!sock->connecting);
   4948 		goto finish;
   4949 	}
   4950 
   4951 	INSIST(sock->connecting);
   4952 	sock->connecting = 0;
   4953 
   4954 	/*
   4955 	 * Get any possible error status here.
   4956 	 */
   4957 	optlen = sizeof(cc);
   4958 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
   4959 		       (void *)&optlen) != 0)
   4960 	{
   4961 		cc = errno;
   4962 	} else {
   4963 		errno = cc;
   4964 	}
   4965 
   4966 	if (errno != 0) {
   4967 		/*
   4968 		 * If the error is EAGAIN, just re-select on this
   4969 		 * fd and pretend nothing strange happened.
   4970 		 */
   4971 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
   4972 			sock->connecting = 1;
   4973 			return;
   4974 		}
   4975 
   4976 		inc_stats(sock->manager->stats,
   4977 			  sock->statsindex[STATID_CONNECTFAIL]);
   4978 
   4979 		/*
   4980 		 * Translate other errors into ISC_R_* flavors.
   4981 		 */
   4982 		switch (errno) {
   4983 #define ERROR_MATCH(a, b)   \
   4984 	case a:             \
   4985 		result = b; \
   4986 		break;
   4987 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
   4988 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
   4989 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
   4990 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
   4991 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
   4992 #ifdef EHOSTDOWN
   4993 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
   4994 #endif /* ifdef EHOSTDOWN */
   4995 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
   4996 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
   4997 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
   4998 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
   4999 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
   5000 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
   5001 #undef ERROR_MATCH
   5002 		default:
   5003 			result = ISC_R_UNEXPECTED;
   5004 			isc_sockaddr_format(&sock->peer_address, peerbuf,
   5005 					    sizeof(peerbuf));
   5006 			strerror_r(errno, strbuf, sizeof(strbuf));
   5007 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   5008 					 "internal_connect: connect(%s) %s",
   5009 					 peerbuf, strbuf);
   5010 		}
   5011 	} else {
   5012 		inc_stats(sock->manager->stats,
   5013 			  sock->statsindex[STATID_CONNECT]);
   5014 		result = ISC_R_SUCCESS;
   5015 		sock->connected = 1;
   5016 		sock->bound = 1;
   5017 	}
   5018 
   5019 	do {
   5020 		dev->result = result;
   5021 		send_connectdone_event(sock, &dev);
   5022 		dev = ISC_LIST_HEAD(sock->connect_list);
   5023 	} while (dev != NULL);
   5024 
   5025 finish:
   5026 	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
   5027 		   SELECT_POKE_CONNECT);
   5028 }
   5029 
   5030 isc_result_t
   5031 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
   5032 	isc_result_t result;
   5033 
   5034 	REQUIRE(VALID_SOCKET(sock));
   5035 	REQUIRE(addressp != NULL);
   5036 
   5037 	LOCK(&sock->lock);
   5038 
   5039 	if (sock->connected) {
   5040 		*addressp = sock->peer_address;
   5041 		result = ISC_R_SUCCESS;
   5042 	} else {
   5043 		result = ISC_R_NOTCONNECTED;
   5044 	}
   5045 
   5046 	UNLOCK(&sock->lock);
   5047 
   5048 	return (result);
   5049 }
   5050 
   5051 isc_result_t
   5052 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
   5053 	socklen_t len;
   5054 	isc_result_t result;
   5055 	char strbuf[ISC_STRERRORSIZE];
   5056 
   5057 	REQUIRE(VALID_SOCKET(sock));
   5058 	REQUIRE(addressp != NULL);
   5059 
   5060 	LOCK(&sock->lock);
   5061 
   5062 	if (!sock->bound) {
   5063 		result = ISC_R_NOTBOUND;
   5064 		goto out;
   5065 	}
   5066 
   5067 	result = ISC_R_SUCCESS;
   5068 
   5069 	len = sizeof(addressp->type);
   5070 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
   5071 		strerror_r(errno, strbuf, sizeof(strbuf));
   5072 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
   5073 		result = ISC_R_UNEXPECTED;
   5074 		goto out;
   5075 	}
   5076 	addressp->length = (unsigned int)len;
   5077 
   5078 out:
   5079 	UNLOCK(&sock->lock);
   5080 
   5081 	return (result);
   5082 }
   5083 
   5084 /*
   5085  * Run through the list of events on this socket, and cancel the ones
   5086  * queued for task "task" of type "how".  "how" is a bitmask.
   5087  */
   5088 void
   5089 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
   5090 	REQUIRE(VALID_SOCKET(sock));
   5091 
   5092 	/*
   5093 	 * Quick exit if there is nothing to do.  Don't even bother locking
   5094 	 * in this case.
   5095 	 */
   5096 	if (how == 0) {
   5097 		return;
   5098 	}
   5099 
   5100 	LOCK(&sock->lock);
   5101 
   5102 	/*
   5103 	 * All of these do the same thing, more or less.
   5104 	 * Each will:
   5105 	 *	o If the internal event is marked as "posted" try to
   5106 	 *	  remove it from the task's queue.  If this fails, mark it
   5107 	 *	  as canceled instead, and let the task clean it up later.
   5108 	 *	o For each I/O request for that task of that type, post
   5109 	 *	  its done event with status of "ISC_R_CANCELED".
   5110 	 *	o Reset any state needed.
   5111 	 */
   5112 	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
   5113 	    !ISC_LIST_EMPTY(sock->recv_list))
   5114 	{
   5115 		isc_socketevent_t *dev;
   5116 		isc_socketevent_t *next;
   5117 		isc_task_t *current_task;
   5118 
   5119 		dev = ISC_LIST_HEAD(sock->recv_list);
   5120 
   5121 		while (dev != NULL) {
   5122 			current_task = dev->ev_sender;
   5123 			next = ISC_LIST_NEXT(dev, ev_link);
   5124 
   5125 			if ((task == NULL) || (task == current_task)) {
   5126 				dev->result = ISC_R_CANCELED;
   5127 				send_recvdone_event(sock, &dev);
   5128 			}
   5129 			dev = next;
   5130 		}
   5131 	}
   5132 
   5133 	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
   5134 	    !ISC_LIST_EMPTY(sock->send_list))
   5135 	{
   5136 		isc_socketevent_t *dev;
   5137 		isc_socketevent_t *next;
   5138 		isc_task_t *current_task;
   5139 
   5140 		dev = ISC_LIST_HEAD(sock->send_list);
   5141 
   5142 		while (dev != NULL) {
   5143 			current_task = dev->ev_sender;
   5144 			next = ISC_LIST_NEXT(dev, ev_link);
   5145 
   5146 			if ((task == NULL) || (task == current_task)) {
   5147 				dev->result = ISC_R_CANCELED;
   5148 				send_senddone_event(sock, &dev);
   5149 			}
   5150 			dev = next;
   5151 		}
   5152 	}
   5153 
   5154 	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
   5155 	    !ISC_LIST_EMPTY(sock->accept_list))
   5156 	{
   5157 		isc_socket_newconnev_t *dev;
   5158 		isc_socket_newconnev_t *next;
   5159 		isc_task_t *current_task;
   5160 
   5161 		dev = ISC_LIST_HEAD(sock->accept_list);
   5162 		while (dev != NULL) {
   5163 			current_task = dev->ev_sender;
   5164 			next = ISC_LIST_NEXT(dev, ev_link);
   5165 
   5166 			if ((task == NULL) || (task == current_task)) {
   5167 				ISC_LIST_UNLINK(sock->accept_list, dev,
   5168 						ev_link);
   5169 
   5170 				isc_refcount_decrementz(
   5171 					&NEWCONNSOCK(dev)->references);
   5172 				free_socket((isc_socket_t **)&dev->newsocket);
   5173 
   5174 				dev->result = ISC_R_CANCELED;
   5175 				dev->ev_sender = sock;
   5176 				isc_task_sendtoanddetach(&current_task,
   5177 							 ISC_EVENT_PTR(&dev),
   5178 							 sock->threadid);
   5179 			}
   5180 
   5181 			dev = next;
   5182 		}
   5183 	}
   5184 
   5185 	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
   5186 	    !ISC_LIST_EMPTY(sock->connect_list))
   5187 	{
   5188 		isc_socket_connev_t *dev;
   5189 		isc_socket_connev_t *next;
   5190 		isc_task_t *current_task;
   5191 
   5192 		INSIST(sock->connecting);
   5193 		sock->connecting = 0;
   5194 
   5195 		dev = ISC_LIST_HEAD(sock->connect_list);
   5196 
   5197 		while (dev != NULL) {
   5198 			current_task = dev->ev_sender;
   5199 			next = ISC_LIST_NEXT(dev, ev_link);
   5200 
   5201 			if ((task == NULL) || (task == current_task)) {
   5202 				dev->result = ISC_R_CANCELED;
   5203 				send_connectdone_event(sock, &dev);
   5204 			}
   5205 			dev = next;
   5206 		}
   5207 	}
   5208 
   5209 	UNLOCK(&sock->lock);
   5210 }
   5211 
   5212 isc_sockettype_t
   5213 isc_socket_gettype(isc_socket_t *sock) {
   5214 	REQUIRE(VALID_SOCKET(sock));
   5215 
   5216 	return (sock->type);
   5217 }
   5218 
   5219 void
   5220 isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
   5221 #if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
   5222 	int onoff = yes ? 1 : 0;
   5223 #else  /* if defined(IPV6_V6ONLY) */
   5224 	UNUSED(yes);
   5225 	UNUSED(sock);
   5226 #endif /* if defined(IPV6_V6ONLY) */
   5227 
   5228 	REQUIRE(VALID_SOCKET(sock));
   5229 	INSIST(!sock->dupped);
   5230 
   5231 #if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
   5232 	if (sock->pf == AF_INET6) {
   5233 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
   5234 			       (void *)&onoff, sizeof(int)) < 0)
   5235 		{
   5236 			char strbuf[ISC_STRERRORSIZE];
   5237 			strerror_r(errno, strbuf, sizeof(strbuf));
   5238 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   5239 					 "setsockopt(%d, IPV6_V6ONLY) failed: "
   5240 					 "%s",
   5241 					 sock->fd, strbuf);
   5242 		}
   5243 	}
   5244 #endif /* ifdef IPV6_V6ONLY */
   5245 }
   5246 
   5247 static void
   5248 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
   5249 #if defined(IP_TOS) || defined(IPV6_TCLASS)
   5250 	int value = dscp << 2;
   5251 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
   5252 
   5253 	sock->dscp = dscp;
   5254 
   5255 #ifdef IP_TOS
   5256 	if (sock->pf == AF_INET) {
   5257 		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
   5258 			       sizeof(value)) < 0)
   5259 		{
   5260 			char strbuf[ISC_STRERRORSIZE];
   5261 			strerror_r(errno, strbuf, sizeof(strbuf));
   5262 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   5263 					 "setsockopt(%d, IP_TOS, %.02x) "
   5264 					 "failed: %s",
   5265 					 sock->fd, value >> 2, strbuf);
   5266 		}
   5267 	}
   5268 #endif /* ifdef IP_TOS */
   5269 #ifdef IPV6_TCLASS
   5270 	if (sock->pf == AF_INET6) {
   5271 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
   5272 			       (void *)&value, sizeof(value)) < 0)
   5273 		{
   5274 			char strbuf[ISC_STRERRORSIZE];
   5275 			strerror_r(errno, strbuf, sizeof(strbuf));
   5276 			UNEXPECTED_ERROR(__FILE__, __LINE__,
   5277 					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
   5278 					 "failed: %s",
   5279 					 sock->fd, dscp >> 2, strbuf);
   5280 		}
   5281 	}
   5282 #endif /* ifdef IPV6_TCLASS */
   5283 }
   5284 
   5285 void
   5286 isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
   5287 	REQUIRE(VALID_SOCKET(sock));
   5288 	REQUIRE(dscp < 0x40);
   5289 
   5290 #if !defined(IP_TOS) && !defined(IPV6_TCLASS)
   5291 	UNUSED(dscp);
   5292 #else  /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
   5293 	if (dscp < 0) {
   5294 		return;
   5295 	}
   5296 
   5297 	/* The DSCP value must not be changed once it has been set. */
   5298 	if (isc_dscp_check_value != -1) {
   5299 		INSIST(dscp == isc_dscp_check_value);
   5300 	}
   5301 #endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
   5302 
   5303 #ifdef notyet
   5304 	REQUIRE(!sock->dupped);
   5305 #endif /* ifdef notyet */
   5306 
   5307 	setdscp(sock, dscp);
   5308 }
   5309 
   5310 isc_socketevent_t *
   5311 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
   5312 		       isc_taskaction_t action, void *arg) {
   5313 	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
   5314 }
   5315 
   5316 void
   5317 isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
   5318 	/*
   5319 	 * Name 'sock'.
   5320 	 */
   5321 
   5322 	REQUIRE(VALID_SOCKET(sock));
   5323 
   5324 	LOCK(&sock->lock);
   5325 	strlcpy(sock->name, name, sizeof(sock->name));
   5326 	sock->tag = tag;
   5327 	UNLOCK(&sock->lock);
   5328 }
   5329 
   5330 const char *
   5331 isc_socket_getname(isc_socket_t *sock) {
   5332 	return (sock->name);
   5333 }
   5334 
   5335 void *
   5336 isc_socket_gettag(isc_socket_t *sock) {
   5337 	return (sock->tag);
   5338 }
   5339 
   5340 int
   5341 isc_socket_getfd(isc_socket_t *sock) {
   5342 	return ((short)sock->fd);
   5343 }
   5344 
   5345 static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
   5346 static bool hasreuseport = false;
   5347 
   5348 static void
   5349 init_hasreuseport(void) {
   5350 /*
   5351  * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
   5352  * We only want to use it on Linux, if it's available. On BSD we want to dup()
   5353  * sockets instead of re-binding them.
   5354  */
   5355 #if (defined(SO_REUSEPORT) && defined(__linux__)) || \
   5356 	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
   5357 	int sock, yes = 1;
   5358 	sock = socket(AF_INET, SOCK_DGRAM, 0);
   5359 	if (sock < 0) {
   5360 		sock = socket(AF_INET6, SOCK_DGRAM, 0);
   5361 		if (sock < 0) {
   5362 			return;
   5363 		}
   5364 	}
   5365 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
   5366 		       sizeof(yes)) < 0)
   5367 	{
   5368 		close(sock);
   5369 		return;
   5370 #if defined(__FreeBSD_kernel__)
   5371 	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
   5372 			      sizeof(yes)) < 0)
   5373 #else  /* if defined(__FreeBSD_kernel__) */
   5374 	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
   5375 			      sizeof(yes)) < 0)
   5376 #endif /* if defined(__FreeBSD_kernel__) */
   5377 	{
   5378 		close(sock);
   5379 		return;
   5380 	}
   5381 	hasreuseport = true;
   5382 	close(sock);
   5383 #endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
   5384 	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
   5385 }
   5386 
   5387 bool
   5388 isc_socket_hasreuseport() {
   5389 	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
   5390 		      ISC_R_SUCCESS);
   5391 	return (hasreuseport);
   5392 }
   5393 
#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
/*
 * Map a socket type to the short name used in the statistics output.
 * Any value that is not one of the four known types yields
 * "not-initialized".
 */
static const char *
_socktype(isc_sockettype_t type) {
	const char *label = "not-initialized";

	switch (type) {
	case isc_sockettype_udp:
		label = "udp";
		break;
	case isc_sockettype_tcp:
		label = "tcp";
		break;
	case isc_sockettype_unix:
		label = "unix";
		break;
	case isc_sockettype_fdwatch:
		label = "fdwatch";
		break;
	default:
		break;
	}

	return (label);
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
   5411 
#ifdef HAVE_LIBXML2
/*
 * Evaluate an xmlTextWriter call, keep its return value in 'xmlrc', and
 * jump to the 'error' label if that value is negative.
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)

/*
 * Write a <sockets> element describing every socket on the manager's
 * socket list to the xmlTextWriter passed as 'writer0'.
 *
 * Each socket becomes a <socket> element with its id, optional name,
 * reference count, type, optional peer address, local address (when
 * getsockname() succeeds) and a <states> list.  The manager lock is held
 * for the whole walk and each socket is locked while it is rendered.
 *
 * Returns the value of the last xmlTextWriter call made; it is negative
 * if rendering stopped early because a call failed.
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;
	isc_socket_t *sock = NULL;
	char addrtext[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t local;
	socklen_t locallen;
	int xmlrc;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));

	/*
	 * 'sock' is non-NULL exactly while a socket is locked, which is
	 * what the error path relies on to release the right lock.
	 */
	for (sock = ISC_LIST_HEAD(mgr->socklist); sock != NULL;
	     sock = ISC_LIST_NEXT(sock, link))
	{
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer)); /* id */

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer)); /* references */

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, addrtext,
					    sizeof(addrtext));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR addrtext));
		}

		/* The local address is emitted only if the OS reports one. */
		locallen = sizeof(local);
		if (getsockname(sock->fd, &local.type.sa, (void *)&locallen) ==
		    0)
		{
			isc_sockaddr_format(&local, addrtext,
					    sizeof(addrtext));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR addrtext));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}
		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
	}

	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* Reached on success too; then no socket is locked any more. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */
   5516 
#ifdef HAVE_JSON_C
/*
 * If 'm' is NULL, set 'result' to ISC_R_NOMEMORY and jump to the 'error'
 * label.  The 'error' path releases the manager lock (and the socket
 * lock when 'sock' is non-NULL), so this macro may only be used while
 * those locks are held.
 */
#define CHECKMEM(m)                              \
	do {                                     \
		if ((m) == NULL) {               \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)

/*
 * Add a "sockets" array describing every socket on the manager's socket
 * list to the JSON object passed as 'stats0'.
 *
 * Each socket becomes an object with "id", optional "name",
 * "references", "type", optional "peer-address", "local-address" (when
 * getsockname() succeeds) and a "states" array.  The manager lock is held
 * for the whole walk and each socket is locked while it is rendered.
 *
 * Returns:
 *	ISC_R_SUCCESS	the array was attached to 'stats0'
 *	ISC_R_NOMEMORY	a JSON allocation failed; the partial array has
 *			been released and 'stats0' is unchanged
 */
isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	/*
	 * No lock is held yet, so this failure must return directly:
	 * going through the 'error' path would unlock a mutex that was
	 * never locked.
	 */
	if (array == NULL) {
		return (ISC_R_NOMEMORY);
	}

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry;
		char buf[255];

		/*
		 * Lock the socket before the first CHECKMEM() of this
		 * iteration: the 'error' path unlocks whatever 'sock'
		 * points at.
		 */
		LOCK(&sock->lock);

		entry = json_object_new_object();
		CHECKMEM(entry);
		/* From here on 'array' owns 'entry'. */
		json_object_array_add(array, entry);

		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		/* The local address is emitted only if the OS reports one. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	/* Hand the finished array over to 'stats'. */
	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	/* A non-NULL 'array' here means rendering was abandoned. */
	if (array != NULL) {
		json_object_put(array);
	}

	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
#endif /* HAVE_JSON_C */
   5636 
   5637 /*
   5638  * Create a new 'type' socket managed by 'manager'.  Events
   5639  * will be posted to 'task' and when dispatched 'action' will be
   5640  * called with 'arg' as the arg value.  The new socket is returned
   5641  * in 'socketp'.
   5642  */
   5643 isc_result_t
   5644 isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
   5645 			 isc_sockfdwatch_t callback, void *cbarg,
   5646 			 isc_task_t *task, isc_socket_t **socketp)
   5647 {
   5648 	isc_socket_t *sock = NULL;
   5649 	isc__socketthread_t *thread;
   5650 	isc_result_t result;
   5651 	int lockid;
   5652 
   5653 	REQUIRE(VALID_MANAGER(manager));
   5654 	REQUIRE(socketp != NULL && *socketp == NULL);
   5655 
   5656 	if (fd < 0 || (unsigned int)fd >= manager->maxsocks)
   5657 		return (ISC_R_RANGE);
   5658 
   5659 	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
   5660 	if (result != ISC_R_SUCCESS)
   5661 		return (result);
   5662 
   5663 	sock->fd = fd;
   5664 	sock->fdwatcharg = cbarg;
   5665 	sock->fdwatchcb = callback;
   5666 	sock->fdwatchflags = flags;
   5667 	sock->fdwatchtask = task;
   5668 
   5669 	sock->threadid = gen_threadid(sock);
   5670 	isc_refcount_init(&sock->references, 1);
   5671 	thread = &manager->threads[sock->threadid];
   5672 	*socketp = (isc_socket_t *)sock;
   5673 
   5674 	/*
   5675 	 * Note we don't have to lock the socket like we normally would because
   5676 	 * there are no external references to it yet.
   5677 	 */
   5678 
   5679 	lockid = FDLOCK_ID(sock->fd);
   5680 	LOCK(&thread->fdlock[lockid]);
   5681 	thread->fds[sock->fd] = sock;
   5682 	thread->fdstate[sock->fd] = MANAGED;
   5683 
   5684 #if defined(USE_EPOLL)
   5685 	manager->epoll_events[sock->fd] = 0;
   5686 #endif
   5687 #ifdef USE_DEVPOLL
   5688 	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
   5689 	       thread->fdpollinfo[sock->fd].want_write == 0);
   5690 #endif /* ifdef USE_DEVPOLL */
   5691 	UNLOCK(&thread->fdlock[lockid]);
   5692 
   5693 	LOCK(&manager->lock);
   5694 	ISC_LIST_APPEND(manager->socklist, sock, link);
   5695 #ifdef USE_SELECT
   5696 	if (thread->maxfd < sock->fd)
   5697 		thread->maxfd = sock->fd;
   5698 #endif
   5699 	UNLOCK(&manager->lock);
   5700 
   5701 	sock->active = 1;
   5702 	if (flags & ISC_SOCKFDWATCH_READ)
   5703 		select_poke(sock->manager, sock->threadid, sock->fd,
   5704 		    SELECT_POKE_READ);
   5705 	if (flags & ISC_SOCKFDWATCH_WRITE)
   5706 		select_poke(sock->manager, sock->threadid, sock->fd,
   5707 		    SELECT_POKE_WRITE);
   5708 
   5709 	socket_log(sock, NULL, CREATION, "fdwatch-created");
   5710 
   5711 	return (ISC_R_SUCCESS);
   5712 }
   5713 
   5714 /*
   5715  * Indicate to the manager that it should watch the socket again.
   5716  * This can be used to restart watching if the previous event handler
   5717  * didn't indicate there was more data to be processed.  Primarily
   5718  * it is for writing but could be used for reading if desired
   5719  */
   5720 
   5721 isc_result_t
   5722 isc_socket_fdwatchpoke(isc_socket_t *sock, int flags)
   5723 {
   5724 	REQUIRE(VALID_SOCKET(sock));
   5725 
   5726 	/*
   5727 	 * We check both flags first to allow us to get the lock
   5728 	 * once but only if we need it.
   5729 	 */
   5730 
   5731 	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
   5732 		LOCK(&sock->lock);
   5733 		if ((flags & ISC_SOCKFDWATCH_READ) != 0)
   5734 			select_poke(sock->manager, sock->threadid, sock->fd,
   5735 				    SELECT_POKE_READ);
   5736 		if ((flags & ISC_SOCKFDWATCH_WRITE) != 0)
   5737 			select_poke(sock->manager, sock->threadid, sock->fd,
   5738 				    SELECT_POKE_WRITE);
   5739 		UNLOCK(&sock->lock);
   5740 	}
   5741 
   5742 	socket_log(sock, NULL, TRACE, "fdwatch-poked flags: %d", flags);
   5743 
   5744 	return (ISC_R_SUCCESS);
   5745 }
   5746