/*	$NetBSD: kqueue.c,v 1.6.8.1 2025/08/02 05:22:53 perseant Exp $	*/

/*	$OpenBSD: kqueue.c,v 1.5 2002/07/10 14:41:31 art Exp $	*/

/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos and Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "event2/event-config.h"
#include "evconfig-private.h"

#ifdef EVENT__HAVE_KQUEUE

#include <sys/types.h>
#ifdef EVENT__HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/queue.h>
#include <sys/event.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#ifdef EVENT__HAVE_INTTYPES_H
#include <inttypes.h>
#endif

/* Some platforms apparently define the udata field of struct kevent as
 * intptr_t, whereas others define it as void*.  There doesn't seem to be an
 * easy way to tell them apart via autoconf, so we need to use OS macros. */
#if defined(__NetBSD__)
#define PTR_TO_UDATA(x) ((typeof(((struct kevent *)0)->udata))(x))
#define INT_TO_UDATA(x) ((typeof(((struct kevent *)0)->udata))(intptr_t)(x))
#elif defined(EVENT__HAVE_INTTYPES_H) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__darwin__) && !defined(__APPLE__) && !defined(__CloudABI__)
#define PTR_TO_UDATA(x)	((intptr_t)(x))
#define INT_TO_UDATA(x) ((intptr_t)(x))
#else
#define PTR_TO_UDATA(x)	(x)
#define INT_TO_UDATA(x) ((void*)(x))
#endif

#include "event-internal.h"
#include "log-internal.h"
#include "evmap-internal.h"
#include "event2/thread.h"
#include "event2/util.h"
#include "evthread-internal.h"
#include "changelist-internal.h"

#include "kqueue-internal.h"

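/* Initial number of slots allocated for the changes and events arrays. */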
#define NEVENT		64

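/* Per-base state for the kqueue backend: the change list we batch up
 * for kevent(), the array that receives triggered events, the kqueue
 * descriptor itself, and the pid of the process that created it. */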
struct kqop {
	struct kevent *changes;
	int changes_size;

	struct kevent *events;
	int events_size;
	int kq;
	int notify_event_added;
	pid_t pid;
};

static void kqop_free(struct kqop *kqop);

static void *kq_init(struct event_base *);
static int kq_sig_add(struct event_base *, int, short, short, void *);
static int kq_sig_del(struct event_base *, int, short, short, void *);
static int kq_dispatch(struct event_base *, struct timeval *);
static void kq_dealloc(struct event_base *);

const struct eventop kqops = {
	"kqueue",
	kq_init,
	event_changelist_add_,
	event_changelist_del_,
	kq_dispatch,
	kq_dealloc,
	1 /* need reinit */,
	EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_FDS,
	EVENT_CHANGELIST_FDINFO_SIZE
};

static const struct eventop kqsigops = {
	"kqueue_signal",
	NULL,
	kq_sig_add,
	kq_sig_del,
	NULL,
	NULL,
	1 /* need reinit */,
	0,
	0
};

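/* Set up the kqueue backend for 'base': create the kernel queue,
 * allocate the initial changes and events arrays, and probe for the
 * broken kqueue implementations found on some versions of OS X. */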
static void *
kq_init(struct event_base *base)
{
	int kq = -1;
	struct kqop *kqueueop = NULL;

	if (!(kqueueop = mm_calloc(1, sizeof(struct kqop))))
		return (NULL);

	/* Initialize the kernel queue */

	if ((kq = kqueue()) == -1) {
		event_warn("kqueue");
		goto err;
	}

	kqueueop->kq = kq;

	kqueueop->pid = getpid();

	/* Initialize fields */
	kqueueop->changes = mm_calloc(NEVENT, sizeof(struct kevent));
	if (kqueueop->changes == NULL)
		goto err;
	kqueueop->events = mm_calloc(NEVENT, sizeof(struct kevent));
	if (kqueueop->events == NULL)
		goto err;
	kqueueop->events_size = kqueueop->changes_size = NEVENT;

	/* Check for Mac OS X kqueue bug. */
	memset(&kqueueop->changes[0], 0, sizeof kqueueop->changes[0]);
	kqueueop->changes[0].ident = -1;
	kqueueop->changes[0].filter = EVFILT_READ;
	kqueueop->changes[0].flags = EV_ADD;
	/*
	 * If kqueue works, then kevent will succeed, and it will
	 * stick an error in events[0].  If kqueue is broken, then
	 * kevent will fail.
	 */
	if (kevent(kq,
		kqueueop->changes, 1, kqueueop->events, NEVENT, NULL) != 1 ||
	    (int)kqueueop->events[0].ident != -1 ||
	    !(kqueueop->events[0].flags & EV_ERROR)) {
		event_warn("%s: detected broken kqueue; not using.", __func__);
		goto err;
	}

	base->evsigsel = &kqsigops;

	return (kqueueop);
err:
	if (kqueueop)
		kqop_free(kqueueop);

	return (NULL);
}

#define ADD_UDATA 0x30303

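/* Translate one libevent changelist entry for (fd, filter) into a
 * struct kevent: EV_CHANGE_ADD becomes EV_ADD (with EV_CLEAR for
 * edge-triggered events), EV_CHANGE_DEL becomes EV_DELETE. */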
static void
kq_setup_kevent(struct kevent *out, evutil_socket_t fd, int filter, short change)
{
	memset(out, 0, sizeof(struct kevent));
	out->ident = fd;
	out->filter = filter;

	if (change & EV_CHANGE_ADD) {
		out->flags = EV_ADD;
		/* We set a magic number here so that we can tell 'add'
		 * errors from 'del' errors. */
		out->udata = INT_TO_UDATA(ADD_UDATA);
		if (change & EV_ET)
			out->flags |= EV_CLEAR;
#ifdef NOTE_EOF
		/* Make it behave like select() and poll() */
		if (filter == EVFILT_READ)
			out->fflags = NOTE_EOF;
#endif
	} else {
		EVUTIL_ASSERT(change & EV_CHANGE_DEL);
		out->flags = EV_DELETE;
	}
}

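/* Convert 'changelist' into kevents in kqop->changes, growing that
 * array as needed.  Returns the number of kevents built, or -1 if we
 * run out of memory or would overflow the array size. */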
static int
kq_build_changes_list(const struct event_changelist *changelist,
    struct kqop *kqop)
{
	int i;
	int n_changes = 0;

	for (i = 0; i < changelist->n_changes; ++i) {
		struct event_change *in_ch = &changelist->changes[i];
		struct kevent *out_ch;
		if (n_changes >= kqop->changes_size - 1) {
			int newsize;
			struct kevent *newchanges;

			if (kqop->changes_size > INT_MAX / 2 ||
			    (size_t)kqop->changes_size * 2 > EV_SIZE_MAX /
			    sizeof(struct kevent)) {
				event_warnx("%s: int overflow", __func__);
				return (-1);
			}

			newsize = kqop->changes_size * 2;
			newchanges = mm_realloc(kqop->changes,
			    newsize * sizeof(struct kevent));
			if (newchanges == NULL) {
				event_warn("%s: realloc", __func__);
				return (-1);
			}
			kqop->changes = newchanges;
			kqop->changes_size = newsize;
		}
		if (in_ch->read_change) {
			out_ch = &kqop->changes[n_changes++];
			kq_setup_kevent(out_ch, in_ch->fd, EVFILT_READ,
			    in_ch->read_change);
		}
		if (in_ch->write_change) {
			out_ch = &kqop->changes[n_changes++];
			kq_setup_kevent(out_ch, in_ch->fd, EVFILT_WRITE,
			    in_ch->write_change);
		}
	}
	return n_changes;
}

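/* Resize the events array to hold 'new_size' entries.  Returns 0 on
 * success, -1 on allocation failure (the old array remains valid). */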
static int
kq_grow_events(struct kqop *kqop, size_t new_size)
{
	struct kevent *newresult;

	newresult = mm_realloc(kqop->events,
	    new_size * sizeof(struct kevent));

	if (newresult) {
		kqop->events = newresult;
		kqop->events_size = new_size;
		return 0;
	} else {
		return -1;
	}
}

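/* Submit all pending changes to the kernel and wait up to 'tv' for
 * events in a single kevent() call, then activate whatever fired.
 * Errors on individual changes come back as EV_ERROR events and are
 * handled inline below. */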
static int
kq_dispatch(struct event_base *base, struct timeval *tv)
{
	struct kqop *kqop = base->evbase;
	struct kevent *events = kqop->events;
	struct kevent *changes;
	struct timespec ts, *ts_p = NULL;
	int i, n_changes, res;

	if (tv != NULL) {
		ts.tv_sec = tv->tv_sec;
		ts.tv_nsec = tv->tv_usec * 1000;
		ts_p = &ts;
	}

	/* Build "changes" from "base->changes" */
	EVUTIL_ASSERT(kqop->changes);
	n_changes = kq_build_changes_list(&base->changelist, kqop);
	if (n_changes < 0)
		return -1;

	event_changelist_remove_all_(&base->changelist, base);

	/* steal the changes array in case some broken code tries to call
	 * dispatch twice at once. */
	changes = kqop->changes;
	kqop->changes = NULL;

	/* Make sure that 'events' is at least as long as the list of changes:
	 * otherwise errors in the changes can get reported as a -1 return
	 * value from kevent() rather than as EV_ERROR events in the events
	 * array.
	 *
	 * (We could instead handle -1 return values from kevent() by
	 * retrying with a smaller changes array or a larger events array,
	 * but this approach seems less risky for now.)
	 */
	if (kqop->events_size < n_changes) {
		int new_size = kqop->events_size;
		do {
			new_size *= 2;
		} while (new_size < n_changes);

		kq_grow_events(kqop, new_size);
		events = kqop->events;
	}

	EVBASE_RELEASE_LOCK(base, th_base_lock);

	res = kevent(kqop->kq, changes, n_changes,
	    events, kqop->events_size, ts_p);

	EVBASE_ACQUIRE_LOCK(base, th_base_lock);

	EVUTIL_ASSERT(kqop->changes == NULL);
	kqop->changes = changes;

	if (res == -1) {
		if (errno != EINTR) {
			event_warn("kevent");
			return (-1);
		}

		return (0);
	}

	event_debug(("%s: kevent reports %d", __func__, res));

	for (i = 0; i < res; i++) {
		int which = 0;

		if (events[i].flags & EV_ERROR) {
			switch (events[i].data) {

			/* Can occur on delete if we are not currently
			 * watching any events on this fd.  That can
			 * happen when the fd was closed and another
			 * file was opened with that fd. */
			case ENOENT:
			/* Can occur for reasons not fully understood
			 * on FreeBSD. */
			case EINVAL:
				continue;
#if defined(__FreeBSD__) && defined(ENOTCAPABLE)
			/*
			 * This currently occurs if an FD is closed
			 * before the EV_DELETE makes it out via kevent().
			 * The FreeBSD capabilities code sees the blank
			 * capability set and rejects the request to
			 * modify an event.
			 *
			 * To be strictly correct - when an FD is closed,
			 * all the registered events are also removed.
			 * Queuing EV_DELETE to a closed FD is wrong.
			 * The event(s) should just be deleted from
			 * the pending changelist.
			 */
			case ENOTCAPABLE:
				continue;
#endif

			/* Can occur on a delete if the fd is closed. */
			case EBADF:
				/* XXXX On NetBSD, we can also get EBADF if we
				 * try to add the write side of a pipe, but
				 * the read side has already been closed.
				 * Other BSDs call this situation 'EPIPE'. It
				 * would be good if we had a way to report
				 * this situation. */
				continue;
			/* These two can occur on an add if the fd was one side
			 * of a pipe, and the other side was closed. */
			case EPERM:
			case EPIPE:
				/* Report read events, if we're listening for
				 * them, so that the user can learn about any
				 * add errors.  (If the operation was a
				 * delete, then udata should be cleared.) */
				if (events[i].udata) {
					/* The operation was an add:
					 * report the error as a read. */
					which |= EV_READ;
					break;
				} else {
					/* The operation was a del:
					 * report nothing. */
					continue;
				}

			/* Other errors shouldn't occur. */
			default:
				errno = events[i].data;
				return (-1);
			}
		} else if (events[i].filter == EVFILT_READ) {
			which |= EV_READ;
		} else if (events[i].filter == EVFILT_WRITE) {
			which |= EV_WRITE;
		} else if (events[i].filter == EVFILT_SIGNAL) {
			which |= EV_SIGNAL;
#ifdef EVFILT_USER
		} else if (events[i].filter == EVFILT_USER) {
			base->is_notify_pending = 0;
#endif
		}

		if (!which)
			continue;

		if (events[i].filter == EVFILT_SIGNAL) {
			evmap_signal_active_(base, events[i].ident, 1);
		} else {
			evmap_io_active_(base, events[i].ident, which | EV_ET);
		}
	}

	if (res == kqop->events_size) {
		/* We used all the events space that we have. Maybe we should
		   make it bigger. */
		kq_grow_events(kqop, kqop->events_size * 2);
	}

	return (0);
}

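/* Free everything owned by 'kqop'.  The kqueue descriptor is closed
 * only if we are still the process that created it: kqueue
 * descriptors are not inherited across fork(), so in a child the
 * same descriptor number may refer to an unrelated file. */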
static void
kqop_free(struct kqop *kqop)
{
	if (kqop->changes)
		mm_free(kqop->changes);
	if (kqop->events)
		mm_free(kqop->events);
	if (kqop->kq >= 0 && kqop->pid == getpid())
		close(kqop->kq);
	memset(kqop, 0, sizeof(struct kqop));
	mm_free(kqop);
}

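/* Tear down the kqueue backend for 'base', including any signal
 * handler state. */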
static void
kq_dealloc(struct event_base *base)
{
	struct kqop *kqop = base->evbase;
	evsig_dealloc_(base);
	kqop_free(kqop);
}

/* signal handling */
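/* Register an EVFILT_SIGNAL event for 'nsignal', and point the
 * signal's disposition somewhere harmless so that delivery does not
 * kill the process before the queue reports it. */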
static int
kq_sig_add(struct event_base *base, int nsignal, short old, short events, void *p)
{
	struct kqop *kqop = base->evbase;
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
	(void)p;

	EVUTIL_ASSERT(nsignal >= 0 && nsignal < NSIG);

	memset(&kev, 0, sizeof(kev));
	kev.ident = nsignal;
	kev.filter = EVFILT_SIGNAL;
	kev.flags = EV_ADD;

	/* Be ready for the signal if it is sent any
	 * time between now and the next call to
	 * kq_dispatch. */
	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1)
		return (-1);

	/* We can set the handler for most signals to SIG_IGN and
	 * still have them reported to us in the queue.  However,
	 * if the handler for SIGCHLD is SIG_IGN, the system reaps
	 * zombie processes for us, and we don't get any notification.
	 * This appears to be the only signal with this quirk. */
	if (evsig_set_handler_(base, nsignal,
	    nsignal == SIGCHLD ? SIG_DFL : SIG_IGN) == -1)
		return (-1);

	return (0);
}

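/* Remove the EVFILT_SIGNAL event for 'nsignal' and restore the
 * signal's original disposition. */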
static int
kq_sig_del(struct event_base *base, int nsignal, short old, short events, void *p)
{
	struct kqop *kqop = base->evbase;
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
	(void)p;

	EVUTIL_ASSERT(nsignal >= 0 && nsignal < NSIG);

	memset(&kev, 0, sizeof(kev));
	kev.ident = nsignal;
	kev.filter = EVFILT_SIGNAL;
	kev.flags = EV_DELETE;

	/* Because we insert signal events
	 * immediately, we need to delete them
	 * immediately, too. */
	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1)
		return (-1);

	if (evsig_restore_handler_(base, nsignal) == -1)
		return (-1);

	return (0);
}


/* OSX 10.6 and FreeBSD 8.1 add support for EVFILT_USER, which we can use
 * to wake up the event loop from another thread. */

/* Magic number we use for our filter ID. */
#define NOTIFY_IDENT 42

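/* Register the EVFILT_USER event that event_kq_notify_base_() will
 * trigger.  Returns -1 on platforms where EVFILT_USER and
 * NOTE_TRIGGER are unavailable. */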
int
event_kq_add_notify_event_(struct event_base *base)
{
	struct kqop *kqop = base->evbase;
#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
#endif

	if (kqop->notify_event_added)
		return 0;

#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	memset(&kev, 0, sizeof(kev));
	kev.ident = NOTIFY_IDENT;
	kev.filter = EVFILT_USER;
	kev.flags = EV_ADD | EV_CLEAR;

	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1) {
		event_warn("kevent: adding EVFILT_USER event");
		return -1;
	}

	kqop->notify_event_added = 1;

	return 0;
#else
	return -1;
#endif
}

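/* Wake the event loop from another thread by triggering the
 * EVFILT_USER event registered above. */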
int
event_kq_notify_base_(struct event_base *base)
{
	struct kqop *kqop = base->evbase;
#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
#endif
	if (! kqop->notify_event_added)
		return -1;

#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	memset(&kev, 0, sizeof(kev));
	kev.ident = NOTIFY_IDENT;
	kev.filter = EVFILT_USER;
	kev.fflags = NOTE_TRIGGER;

	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1) {
		event_warn("kevent: triggering EVFILT_USER event");
		return -1;
	}

	return 0;
#else
	return -1;
#endif
}

#endif /* EVENT__HAVE_KQUEUE */