Home | History | Annotate | Line # | Download | only in kern
      1  1.4  christos /*	$NetBSD: sys_epoll.c,v 1.4 2023/07/30 18:31:13 christos Exp $	*/
      2  1.1  christos 
      3  1.1  christos /*-
      4  1.1  christos  * SPDX-License-Identifier: BSD-2-Clause
      5  1.1  christos  *
      6  1.1  christos  * Copyright (c) 2007 Roman Divacky
      7  1.1  christos  * Copyright (c) 2014 Dmitry Chagin <dchagin (at) FreeBSD.org>
      8  1.1  christos  *
      9  1.1  christos  * Redistribution and use in source and binary forms, with or without
     10  1.1  christos  * modification, are permitted provided that the following conditions
     11  1.1  christos  * are met:
     12  1.1  christos  * 1. Redistributions of source code must retain the above copyright
     13  1.1  christos  *    notice, this list of conditions and the following disclaimer.
     14  1.1  christos  * 2. Redistributions in binary form must reproduce the above copyright
     15  1.1  christos  *    notice, this list of conditions and the following disclaimer in the
     16  1.1  christos  *    documentation and/or other materials provided with the distribution.
     17  1.1  christos  *
     18  1.1  christos  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     19  1.1  christos  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     20  1.1  christos  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     21  1.1  christos  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     22  1.1  christos  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     23  1.1  christos  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     24  1.1  christos  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     25  1.1  christos  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     26  1.1  christos  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     27  1.1  christos  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     28  1.1  christos  * SUCH DAMAGE.
     29  1.1  christos  */
     30  1.1  christos #include <sys/cdefs.h>
     31  1.4  christos __KERNEL_RCSID(0, "$NetBSD: sys_epoll.c,v 1.4 2023/07/30 18:31:13 christos Exp $");
     32  1.1  christos 
     33  1.1  christos 
     34  1.1  christos #include <sys/param.h>
     35  1.1  christos #include <sys/types.h>
     36  1.1  christos #include <sys/bitops.h>
     37  1.1  christos #include <sys/epoll.h>
     38  1.1  christos #include <sys/event.h>
     39  1.1  christos #include <sys/eventvar.h>
     40  1.1  christos #include <sys/errno.h>
     41  1.1  christos #include <sys/file.h>
     42  1.1  christos #include <sys/filedesc.h>
     43  1.1  christos #include <sys/fcntl.h>
     44  1.1  christos #include <sys/proc.h>
     45  1.1  christos #include <sys/signal.h>
     46  1.1  christos #include <sys/vnode.h>
     47  1.1  christos 
     48  1.1  christos #include <sys/syscallargs.h>
     49  1.1  christos 
     50  1.1  christos #define	EPOLL_MAX_DEPTH		5
     51  1.1  christos 
     52  1.1  christos #define	EPOLL_EVRD	(EPOLLIN|EPOLLRDNORM)
     53  1.1  christos #define	EPOLL_EVWR	(EPOLLOUT|EPOLLWRNORM)
     54  1.1  christos #define	EPOLL_EVSUP	(EPOLLET|EPOLLONESHOT|EPOLLHUP|EPOLLERR|EPOLLPRI \
     55  1.1  christos 			|EPOLL_EVRD|EPOLL_EVWR|EPOLLRDHUP)
     56  1.1  christos 
     57  1.1  christos #define	kext_data	ext[0]
     58  1.1  christos #define	kext_epfd	ext[1]
     59  1.1  christos #define	kext_fd		ext[2]
     60  1.1  christos 
     61  1.1  christos #if DEBUG
     62  1.1  christos #define	DPRINTF(x) uprintf x
     63  1.1  christos #else
     64  1.1  christos #define	DPRINTF(x) __nothing
     65  1.1  christos #endif
     66  1.1  christos 
     67  1.1  christos struct epoll_edge {
     68  1.1  christos 	int epfd;
     69  1.1  christos 	int fd;
     70  1.1  christos };
     71  1.1  christos 
     72  1.1  christos __BITMAP_TYPE(epoll_seen, char, 1);
     73  1.1  christos 
     74  1.1  christos static int	epoll_to_kevent(int, int, struct epoll_event *, struct kevent *,
     75  1.1  christos     int *);
     76  1.1  christos static void	kevent_to_epoll(struct kevent *, struct epoll_event *);
     77  1.1  christos static int      epoll_kev_put_events(void *, struct kevent *, struct kevent *,
     78  1.1  christos     size_t, int);
     79  1.1  christos static int	epoll_kev_fetch_changes(void *, const struct kevent *,
     80  1.1  christos     struct kevent *, size_t, int);
     81  1.1  christos static int	epoll_kev_fetch_timeout(const void *, void *, size_t);
     82  1.1  christos static int	epoll_register_kevent(register_t *, int, int, int,
     83  1.1  christos     unsigned int);
     84  1.1  christos static int	epoll_fd_registered(register_t *, int, int);
     85  1.1  christos static int	epoll_delete_all_events(register_t *, int, int);
     86  1.1  christos static int	epoll_recover_watch_tree(struct epoll_edge *, size_t, size_t);
     87  1.1  christos static int	epoll_dfs(struct epoll_edge *, size_t, struct epoll_seen *,
     88  1.1  christos     size_t, int, int);
     89  1.1  christos static int	epoll_check_loop_and_depth(struct lwp *, int, int);
     90  1.1  christos 
     91  1.1  christos /*
     92  1.1  christos  * epoll_create1(2).  Parse the flags and then create a kqueue instance.
     93  1.1  christos  */
     94  1.1  christos int
     95  1.1  christos sys_epoll_create1(struct lwp *l, const struct sys_epoll_create1_args *uap,
     96  1.1  christos     register_t *retval)
     97  1.1  christos {
     98  1.1  christos 	/* {
     99  1.1  christos 		syscallarg(int) flags;
    100  1.1  christos 	} */
    101  1.1  christos 	struct sys_kqueue1_args kqa;
    102  1.1  christos 
    103  1.4  christos 	if ((SCARG(uap, flags) & ~(EPOLL_CLOEXEC)) != 0)
    104  1.1  christos 		return EINVAL;
    105  1.1  christos 
    106  1.4  christos 	SCARG(&kqa, flags) = 0;
    107  1.4  christos 	if (SCARG(uap, flags) & EPOLL_CLOEXEC)
    108  1.4  christos 		SCARG(&kqa, flags) |= O_CLOEXEC;
    109  1.1  christos 
    110  1.1  christos 	return sys_kqueue1(l, &kqa, retval);
    111  1.1  christos }
    112  1.1  christos 
    113  1.1  christos /*
    114  1.1  christos  * Structure converting function from epoll to kevent.
    115  1.1  christos  */
    116  1.1  christos static int
    117  1.1  christos epoll_to_kevent(int epfd, int fd, struct epoll_event *l_event,
    118  1.1  christos     struct kevent *kevent, int *nkevents)
    119  1.1  christos {
    120  1.1  christos 	uint32_t levents = l_event->events;
    121  1.1  christos 	uint32_t kev_flags = EV_ADD | EV_ENABLE;
    122  1.1  christos 
    123  1.1  christos 	/* flags related to how event is registered */
    124  1.1  christos 	if ((levents & EPOLLONESHOT) != 0)
    125  1.1  christos 		kev_flags |= EV_DISPATCH;
    126  1.1  christos 	if ((levents & EPOLLET) != 0)
    127  1.1  christos 		kev_flags |= EV_CLEAR;
    128  1.1  christos 	if ((levents & EPOLLERR) != 0)
    129  1.1  christos 		kev_flags |= EV_ERROR;
    130  1.1  christos 	if ((levents & EPOLLRDHUP) != 0)
    131  1.1  christos 		kev_flags |= EV_EOF;
    132  1.1  christos 
    133  1.1  christos 	/* flags related to what event is registered */
    134  1.1  christos 	if ((levents & EPOLL_EVRD) != 0) {
    135  1.1  christos 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
    136  1.1  christos 		kevent->kext_data = l_event->data;
    137  1.1  christos 		kevent->kext_epfd = epfd;
    138  1.1  christos 		kevent->kext_fd = fd;
    139  1.1  christos 		++kevent;
    140  1.1  christos 		++(*nkevents);
    141  1.1  christos 	}
    142  1.1  christos 	if ((levents & EPOLL_EVWR) != 0) {
    143  1.1  christos 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
    144  1.1  christos 		kevent->kext_data = l_event->data;
    145  1.1  christos 		kevent->kext_epfd = epfd;
    146  1.1  christos 		kevent->kext_fd = fd;
    147  1.1  christos 		++kevent;
    148  1.1  christos 		++(*nkevents);
    149  1.1  christos 	}
    150  1.1  christos 	/* zero event mask is legal */
    151  1.1  christos 	if ((levents & (EPOLL_EVRD | EPOLL_EVWR)) == 0) {
    152  1.1  christos 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
    153  1.1  christos 		++(*nkevents);
    154  1.1  christos 	}
    155  1.1  christos 
    156  1.1  christos 	if ((levents & ~(EPOLL_EVSUP)) != 0) {
    157  1.1  christos 		return EINVAL;
    158  1.1  christos 	}
    159  1.1  christos 
    160  1.1  christos 	return 0;
    161  1.1  christos }
    162  1.1  christos 
    163  1.1  christos /*
    164  1.1  christos  * Structure converting function from kevent to epoll. In a case
    165  1.1  christos  * this is called on error in registration we store the error in
    166  1.1  christos  * event->data and pick it up later in sys_epoll_ctl().
    167  1.1  christos  */
    168  1.1  christos static void
    169  1.1  christos kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
    170  1.1  christos {
    171  1.1  christos 
    172  1.1  christos 	l_event->data = kevent->kext_data;
    173  1.1  christos 
    174  1.1  christos 	if ((kevent->flags & EV_ERROR) != 0) {
    175  1.1  christos 		l_event->events = EPOLLERR;
    176  1.1  christos 		return;
    177  1.1  christos 	}
    178  1.1  christos 
    179  1.1  christos 	/* XXX EPOLLPRI, EPOLLHUP */
    180  1.1  christos 	switch (kevent->filter) {
    181  1.1  christos 	case EVFILT_READ:
    182  1.1  christos 		l_event->events = EPOLLIN;
    183  1.1  christos 		if ((kevent->flags & EV_EOF) != 0)
    184  1.1  christos 			l_event->events |= EPOLLRDHUP;
    185  1.1  christos 		break;
    186  1.1  christos 	case EVFILT_WRITE:
    187  1.1  christos 		l_event->events = EPOLLOUT;
    188  1.1  christos 		break;
    189  1.1  christos 	default:
    190  1.1  christos 		DPRINTF(("%s: unhandled kevent filter %d\n", __func__,
    191  1.1  christos 		    kevent->filter));
    192  1.1  christos 		break;
    193  1.1  christos 	}
    194  1.1  christos }
    195  1.1  christos 
    196  1.1  christos /*
    197  1.1  christos  * Copyout callback used by kevent.  This converts kevent events to
    198  1.1  christos  * epoll events that are located in args->eventlist.
    199  1.1  christos  */
    200  1.1  christos static int
    201  1.1  christos epoll_kev_put_events(void *ctx, struct kevent *events,
    202  1.1  christos     struct kevent *eventlist, size_t index, int n)
    203  1.1  christos {
    204  1.1  christos 	int i;
    205  1.1  christos 	struct epoll_event *eep = (struct epoll_event *)eventlist;
    206  1.1  christos 
    207  1.1  christos 	KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);
    208  1.1  christos 
    209  1.1  christos 	for (i = 0; i < n; i++)
    210  1.1  christos 		kevent_to_epoll(events + i, eep + index + i);
    211  1.1  christos 
    212  1.1  christos 	return 0;
    213  1.1  christos }
    214  1.1  christos 
    215  1.1  christos /*
    216  1.1  christos  * Copyin callback used by kevent. This copies already
    217  1.1  christos  * converted filters from kernel memory to the kevent
    218  1.1  christos  * internal kernel memory. Hence the memcpy instead of
    219  1.1  christos  * copyin.
    220  1.1  christos  */
    221  1.1  christos static int
    222  1.1  christos epoll_kev_fetch_changes(void *ctx, const struct kevent *changelist,
    223  1.1  christos     struct kevent *changes, size_t index, int n)
    224  1.1  christos {
    225  1.1  christos 	KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);
    226  1.1  christos 
    227  1.1  christos 	memcpy(changes, changelist + index, n * sizeof(*changes));
    228  1.1  christos 
    229  1.1  christos 	return 0;
    230  1.1  christos }
    231  1.1  christos 
/*
 * Timeout copy callback used by kevent.  The timeout was already
 * converted and sits in kernel memory, so size bytes are moved from
 * src to dest with memcpy instead of copyin.  Always returns 0.
 */
static int
epoll_kev_fetch_timeout(const void *src, void *dest, size_t size)
{

	(void)memcpy(dest, src, size);
	return 0;
}
    244  1.1  christos 
    245  1.1  christos /*
    246  1.1  christos  * Load epoll filter, convert it to kevent filter and load it into
    247  1.1  christos  * kevent subsystem.
    248  1.1  christos  *
    249  1.1  christos  * event must point to kernel memory or be NULL.
    250  1.1  christos  */
    251  1.1  christos int
    252  1.1  christos epoll_ctl_common(struct lwp *l, register_t *retval, int epfd, int op, int fd,
    253  1.1  christos     struct epoll_event *event)
    254  1.1  christos {
    255  1.1  christos 	struct kevent kev[2];
    256  1.3       rin 	struct kevent_ops k_ops = {
    257  1.1  christos 		.keo_private = NULL,
    258  1.1  christos 		.keo_fetch_timeout = NULL,
    259  1.1  christos 		.keo_fetch_changes = epoll_kev_fetch_changes,
    260  1.1  christos 		.keo_put_events = NULL,
    261  1.1  christos 	};
    262  1.1  christos 	file_t *epfp, *fp;
    263  1.1  christos 	int error = 0;
    264  1.1  christos 	int nchanges = 0;
    265  1.1  christos 
    266  1.1  christos 	/*
    267  1.1  christos 	 * Need to validate epfd and fd separately from kevent1 to match
    268  1.1  christos 	 * Linux's errno behaviour.
    269  1.1  christos 	 */
    270  1.1  christos 	epfp = fd_getfile(epfd);
    271  1.1  christos 	if (epfp == NULL)
    272  1.1  christos 		return EBADF;
    273  1.1  christos 	if (epfp->f_type != DTYPE_KQUEUE)
    274  1.1  christos 		error = EINVAL;
    275  1.1  christos 	fd_putfile(epfd);
    276  1.1  christos 	if (error != 0)
    277  1.1  christos 		return error;
    278  1.1  christos 
    279  1.1  christos 	fp = fd_getfile(fd);
    280  1.1  christos 	if (fp == NULL)
    281  1.1  christos 		return EBADF;
    282  1.1  christos 	if (fp->f_type == DTYPE_VNODE) {
    283  1.1  christos 		switch (fp->f_vnode->v_type) {
    284  1.1  christos 		case VREG:
    285  1.1  christos 		case VDIR:
    286  1.1  christos 		case VBLK:
    287  1.1  christos 		case VLNK:
    288  1.1  christos 			error = EPERM;
    289  1.1  christos 			break;
    290  1.1  christos 
    291  1.1  christos 		default:
    292  1.1  christos 			break;
    293  1.1  christos 		}
    294  1.1  christos 	}
    295  1.1  christos 	fd_putfile(fd);
    296  1.1  christos 	if (error != 0)
    297  1.1  christos 		return error;
    298  1.1  christos 
    299  1.1  christos 	/* Linux disallows spying on himself */
    300  1.1  christos 	if (epfd == fd) {
    301  1.1  christos 		return EINVAL;
    302  1.1  christos 	}
    303  1.1  christos 
    304  1.1  christos 	if (op != EPOLL_CTL_DEL) {
    305  1.1  christos 		error = epoll_to_kevent(epfd, fd, event, kev, &nchanges);
    306  1.1  christos 		if (error != 0)
    307  1.1  christos 			return error;
    308  1.1  christos 	}
    309  1.1  christos 
    310  1.1  christos 	switch (op) {
    311  1.1  christos 	case EPOLL_CTL_MOD:
    312  1.1  christos 		error = epoll_delete_all_events(retval, epfd, fd);
    313  1.1  christos 		if (error != 0)
    314  1.1  christos 			return error;
    315  1.1  christos 		break;
    316  1.1  christos 
    317  1.1  christos 	case EPOLL_CTL_ADD:
    318  1.1  christos 		if (epoll_fd_registered(retval, epfd, fd))
    319  1.1  christos 			return EEXIST;
    320  1.1  christos 		error = epoll_check_loop_and_depth(l, epfd, fd);
    321  1.1  christos 		if (error != 0)
    322  1.1  christos 			return error;
    323  1.1  christos 		break;
    324  1.1  christos 
    325  1.1  christos 	case EPOLL_CTL_DEL:
    326  1.1  christos 		/* CTL_DEL means unregister this fd with this epoll */
    327  1.1  christos 		return epoll_delete_all_events(retval, epfd, fd);
    328  1.1  christos 
    329  1.1  christos 	default:
    330  1.2  pgoyette 		DPRINTF(("%s: invalid op %d\n", __func__, op));
    331  1.1  christos 		return EINVAL;
    332  1.1  christos 	}
    333  1.1  christos 
    334  1.1  christos 	error = kevent1(retval, epfd, kev, nchanges, NULL, 0, NULL, &k_ops);
    335  1.1  christos 
    336  1.1  christos 	if (error == EOPNOTSUPP) {
    337  1.1  christos 		error = EPERM;
    338  1.1  christos 	}
    339  1.1  christos 
    340  1.1  christos 	return error;
    341  1.1  christos }
    342  1.1  christos 
    343  1.1  christos /*
    344  1.1  christos  * epoll_ctl(2).  Copyin event if necessary and then call
    345  1.1  christos  * epoll_ctl_common().
    346  1.1  christos  */
    347  1.1  christos int
    348  1.1  christos sys_epoll_ctl(struct lwp *l, const struct sys_epoll_ctl_args *uap,
    349  1.1  christos     register_t *retval)
    350  1.1  christos {
    351  1.1  christos 	/* {
    352  1.1  christos 		syscallarg(int) epfd;
    353  1.1  christos 		syscallarg(int) op;
    354  1.1  christos 		syscallarg(int) fd;
    355  1.1  christos 		syscallarg(struct epoll_event *) event;
    356  1.1  christos 	} */
    357  1.1  christos 	struct epoll_event ee;
    358  1.1  christos 	struct epoll_event *eep;
    359  1.1  christos 	int error;
    360  1.1  christos 
    361  1.1  christos 	if (SCARG(uap, op) != EPOLL_CTL_DEL) {
    362  1.1  christos 		error = copyin(SCARG(uap, event), &ee, sizeof(ee));
    363  1.1  christos 		if (error != 0)
    364  1.1  christos 			return error;
    365  1.1  christos 
    366  1.1  christos 		eep = &ee;
    367  1.1  christos 	} else
    368  1.1  christos 		eep = NULL;
    369  1.1  christos 
    370  1.1  christos 	return epoll_ctl_common(l, retval, SCARG(uap, epfd), SCARG(uap, op),
    371  1.1  christos 	    SCARG(uap, fd), eep);
    372  1.1  christos }
    373  1.1  christos 
    374  1.1  christos /*
    375  1.1  christos  * Wait for a filter to be triggered on the epoll file descriptor.
    376  1.1  christos  * All of the epoll_*wait* syscalls eventually end up here.
    377  1.1  christos  *
    378  1.1  christos  * events, nss, and ssp must point to kernel memory (or be NULL).
    379  1.1  christos  */
    380  1.1  christos int
    381  1.1  christos epoll_wait_common(struct lwp *l, register_t *retval, int epfd,
    382  1.1  christos     struct epoll_event *events, int maxevents, struct timespec *tsp,
    383  1.1  christos     const sigset_t *nssp)
    384  1.1  christos {
    385  1.1  christos 	struct kevent_ops k_ops = {
    386  1.1  christos 	        .keo_private = NULL,
    387  1.1  christos 		.keo_fetch_timeout = epoll_kev_fetch_timeout,
    388  1.1  christos 		.keo_fetch_changes = NULL,
    389  1.1  christos 		.keo_put_events = epoll_kev_put_events,
    390  1.1  christos 	};
    391  1.1  christos 	struct proc *p = l->l_proc;
    392  1.1  christos 	file_t *epfp;
    393  1.1  christos 	sigset_t oss;
    394  1.1  christos 	int error = 0;
    395  1.1  christos 
    396  1.1  christos 	if (maxevents <= 0 || maxevents > EPOLL_MAX_EVENTS)
    397  1.1  christos 		return EINVAL;
    398  1.1  christos 
    399  1.1  christos 	/*
    400  1.1  christos 	 * Need to validate epfd separately from kevent1 to match
    401  1.1  christos 	 * Linux's errno behaviour.
    402  1.1  christos 	 */
    403  1.1  christos 	epfp = fd_getfile(epfd);
    404  1.1  christos 	if (epfp == NULL)
    405  1.1  christos 		return EBADF;
    406  1.1  christos 	if (epfp->f_type != DTYPE_KQUEUE)
    407  1.1  christos 		error = EINVAL;
    408  1.1  christos 	fd_putfile(epfd);
    409  1.1  christos 	if (error != 0)
    410  1.1  christos 		return error;
    411  1.1  christos 
    412  1.1  christos 	if (nssp != NULL) {
    413  1.1  christos 		mutex_enter(p->p_lock);
    414  1.1  christos 		error = sigprocmask1(l, SIG_SETMASK, nssp, &oss);
    415  1.1  christos 		mutex_exit(p->p_lock);
    416  1.1  christos 		if (error != 0)
    417  1.1  christos 			return error;
    418  1.1  christos 	}
    419  1.1  christos 
    420  1.1  christos 	error = kevent1(retval, epfd, NULL, 0, (struct kevent *)events,
    421  1.1  christos 	    maxevents, tsp, &k_ops);
    422  1.1  christos 	/*
    423  1.1  christos 	 * Since we're not registering nay events, ENOMEM should not
    424  1.1  christos 	 * be possible for this specific kevent1 call.
    425  1.1  christos 	 */
    426  1.1  christos 	KASSERT(error != ENOMEM);
    427  1.1  christos 
    428  1.1  christos 	if (nssp != NULL) {
    429  1.1  christos 	        mutex_enter(p->p_lock);
    430  1.1  christos 		error = sigprocmask1(l, SIG_SETMASK, &oss, NULL);
    431  1.1  christos 		mutex_exit(p->p_lock);
    432  1.1  christos 	}
    433  1.1  christos 
    434  1.1  christos 	return error;
    435  1.1  christos }
    436  1.1  christos 
    437  1.1  christos /*
    438  1.1  christos  * epoll_pwait2(2).
    439  1.1  christos  */
    440  1.1  christos int
    441  1.1  christos sys_epoll_pwait2(struct lwp *l, const struct sys_epoll_pwait2_args *uap,
    442  1.1  christos     register_t *retval)
    443  1.1  christos {
    444  1.1  christos 	/* {
    445  1.1  christos 		syscallarg(int) epfd;
    446  1.1  christos 		syscallarg(struct epoll_event *) events;
    447  1.1  christos 		syscallarg(int) maxevents;
    448  1.1  christos 		syscallarg(struct timespec *) timeout;
    449  1.1  christos 		syscallarg(sigset_t *) sigmask;
    450  1.1  christos 	} */
    451  1.1  christos 	struct epoll_event *events;
    452  1.1  christos 	struct timespec ts, *tsp;
    453  1.1  christos 	sigset_t ss, *ssp;
    454  1.1  christos 	int error;
    455  1.1  christos 	const int maxevents = SCARG(uap, maxevents);
    456  1.1  christos 
    457  1.1  christos 	if (maxevents <= 0 || maxevents >= EPOLL_MAX_EVENTS)
    458  1.1  christos 		return EINVAL;
    459  1.1  christos 
    460  1.1  christos 	if (SCARG(uap, timeout) != NULL) {
    461  1.1  christos 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
    462  1.1  christos 		if (error != 0)
    463  1.1  christos 			return error;
    464  1.1  christos 
    465  1.1  christos 		tsp = &ts;
    466  1.1  christos 	} else
    467  1.1  christos 		tsp = NULL;
    468  1.1  christos 
    469  1.1  christos 	if (SCARG(uap, sigmask) != NULL) {
    470  1.1  christos 		error = copyin(SCARG(uap, sigmask), &ss, sizeof(ss));
    471  1.1  christos 		if (error != 0)
    472  1.1  christos 			return error;
    473  1.1  christos 
    474  1.1  christos 		ssp = &ss;
    475  1.1  christos 	} else
    476  1.1  christos 		ssp = NULL;
    477  1.1  christos 
    478  1.1  christos 	events = kmem_alloc(maxevents * sizeof(*events), KM_SLEEP);
    479  1.1  christos 
    480  1.1  christos 	error = epoll_wait_common(l, retval, SCARG(uap, epfd), events,
    481  1.1  christos 	    maxevents, tsp, ssp);
    482  1.1  christos 	if (error == 0)
    483  1.1  christos 		error = copyout(events, SCARG(uap, events),
    484  1.1  christos 		    *retval * sizeof(*events));
    485  1.1  christos 
    486  1.1  christos 	kmem_free(events, maxevents * sizeof(*events));
    487  1.1  christos 	return error;
    488  1.1  christos }
    489  1.1  christos 
    490  1.1  christos /*
    491  1.1  christos  * Helper that registers a single kevent.
    492  1.1  christos  */
    493  1.1  christos static int
    494  1.1  christos epoll_register_kevent(register_t *retval, int epfd, int fd, int filter,
    495  1.1  christos     unsigned int flags)
    496  1.1  christos {
    497  1.1  christos 	struct kevent kev;
    498  1.1  christos 	struct kevent_ops k_ops = {
    499  1.1  christos 		.keo_private = NULL,
    500  1.1  christos 		.keo_fetch_timeout = NULL,
    501  1.1  christos 		.keo_fetch_changes = epoll_kev_fetch_changes,
    502  1.1  christos 		.keo_put_events = NULL,
    503  1.1  christos 	};
    504  1.1  christos 
    505  1.1  christos 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
    506  1.1  christos 
    507  1.3       rin 	return kevent1(retval, epfd, &kev, 1, NULL, 0, NULL, &k_ops);
    508  1.1  christos }
    509  1.1  christos 
    510  1.1  christos /*
    511  1.1  christos  * Check if an fd is already registered in the kqueue referenced by epfd.
    512  1.1  christos  */
    513  1.1  christos static int
    514  1.1  christos epoll_fd_registered(register_t *retval, int epfd, int fd)
    515  1.1  christos {
    516  1.1  christos 	/*
    517  1.1  christos 	 * Set empty filter flags to avoid accidental modification of already
    518  1.1  christos 	 * registered events. In the case of event re-registration:
    519  1.1  christos 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
    520  1.1  christos 	 * 2. If event does exists, it's enabled/disabled state is preserved
    521  1.1  christos 	 *    but fflags, data and udata fields are overwritten. So we can not
    522  1.1  christos 	 *    set socket lowats and store user's context pointer in udata.
    523  1.1  christos 	 */
    524  1.1  christos 	if (epoll_register_kevent(retval, epfd, fd, EVFILT_READ, 0) != ENOENT ||
    525  1.1  christos 	    epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE, 0) != ENOENT)
    526  1.1  christos 		return 1;
    527  1.1  christos 
    528  1.1  christos 	return 0;
    529  1.1  christos }
    530  1.1  christos 
    531  1.1  christos /*
    532  1.1  christos  * Remove all events in the kqueue referenced by epfd that depend on
    533  1.1  christos  * fd.
    534  1.1  christos  */
    535  1.1  christos static int
    536  1.1  christos epoll_delete_all_events(register_t *retval, int epfd, int fd)
    537  1.1  christos {
    538  1.1  christos 	int error1, error2;
    539  1.1  christos 
    540  1.1  christos 	error1 = epoll_register_kevent(retval, epfd, fd, EVFILT_READ,
    541  1.1  christos 	    EV_DELETE);
    542  1.1  christos 	error2 = epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE,
    543  1.1  christos 	    EV_DELETE);
    544  1.1  christos 
    545  1.1  christos 	/* return 0 if at least one result positive */
    546  1.1  christos 	return error1 == 0 ? 0 : error2;
    547  1.1  christos }
    548  1.1  christos 
    549  1.1  christos /*
    550  1.1  christos  * Interate through all the knotes and recover a directed graph on
    551  1.1  christos  * which kqueues are watching each other.
    552  1.1  christos  *
    553  1.1  christos  * If edges is NULL, the number of edges is still counted but no graph
    554  1.1  christos  * is assembled.
    555  1.1  christos  */
    556  1.1  christos static int
    557  1.1  christos epoll_recover_watch_tree(struct epoll_edge *edges, size_t nedges, size_t nfds) {
    558  1.1  christos 	file_t *currfp, *targetfp;
    559  1.1  christos 	struct knote *kn, *tmpkn;
    560  1.1  christos 	size_t i, nedges_so_far = 0;
    561  1.1  christos 
    562  1.1  christos 	for (i = 0; i < nfds && (edges == NULL || nedges_so_far < nedges); i++)
    563  1.1  christos 	{
    564  1.1  christos 		currfp = fd_getfile(i);
    565  1.1  christos 		if (currfp == NULL)
    566  1.1  christos 			continue;
    567  1.1  christos 		if (currfp->f_type != DTYPE_KQUEUE)
    568  1.1  christos 			goto continue_count_outer;
    569  1.1  christos 
    570  1.1  christos 		SLIST_FOREACH_SAFE(kn, &currfp->f_kqueue->kq_sel.sel_klist,
    571  1.1  christos 		    kn_selnext, tmpkn) {
    572  1.1  christos 			targetfp = fd_getfile(kn->kn_kevent.kext_epfd);
    573  1.1  christos 			if (targetfp == NULL)
    574  1.1  christos 				continue;
    575  1.1  christos 			if (targetfp->f_type == DTYPE_KQUEUE) {
    576  1.1  christos 				if (edges != NULL) {
    577  1.1  christos 					edges[nedges_so_far].epfd =
    578  1.1  christos 					    kn->kn_kevent.kext_epfd;
    579  1.1  christos 					edges[nedges_so_far].fd =
    580  1.1  christos 					    kn->kn_kevent.kext_fd;
    581  1.1  christos 				}
    582  1.1  christos 				nedges_so_far++;
    583  1.1  christos 			}
    584  1.1  christos 
    585  1.1  christos 			fd_putfile(kn->kn_kevent.kext_epfd);
    586  1.1  christos 		}
    587  1.1  christos 
    588  1.1  christos continue_count_outer:
    589  1.1  christos 		fd_putfile(i);
    590  1.1  christos 	}
    591  1.1  christos 
    592  1.1  christos 	return nedges_so_far;
    593  1.1  christos }
    594  1.1  christos 
    595  1.1  christos /*
    596  1.1  christos  * Run dfs on the graph described by edges, checking for loops and a
    597  1.1  christos  * depth greater than EPOLL_MAX_DEPTH.
    598  1.1  christos  */
    599  1.1  christos static int
    600  1.1  christos epoll_dfs(struct epoll_edge *edges, size_t nedges, struct epoll_seen *seen,
    601  1.1  christos     size_t nseen, int currfd, int depth)
    602  1.1  christos {
    603  1.1  christos 	int error;
    604  1.1  christos 	size_t i;
    605  1.1  christos 
    606  1.1  christos 	KASSERT(edges != NULL);
    607  1.1  christos 	KASSERT(seen != NULL);
    608  1.1  christos 	KASSERT(nedges > 0);
    609  1.1  christos 	KASSERT(currfd < nseen);
    610  1.1  christos 	KASSERT(0 <= depth && depth <= EPOLL_MAX_DEPTH + 1);
    611  1.1  christos 
    612  1.1  christos 	if (__BITMAP_ISSET(currfd, seen))
    613  1.1  christos 		return ELOOP;
    614  1.1  christos 
    615  1.1  christos 	__BITMAP_SET(currfd, seen);
    616  1.1  christos 
    617  1.1  christos 	depth++;
    618  1.1  christos 	if (depth > EPOLL_MAX_DEPTH)
    619  1.1  christos 		return EINVAL;
    620  1.1  christos 
    621  1.1  christos 	for (i = 0; i < nedges; i++) {
    622  1.1  christos 		if (edges[i].epfd != currfd)
    623  1.1  christos 			continue;
    624  1.1  christos 
    625  1.1  christos 		error = epoll_dfs(edges, nedges, seen, nseen,
    626  1.1  christos 		    edges[i].fd, depth);
    627  1.1  christos 		if (error != 0)
    628  1.1  christos 			return error;
    629  1.1  christos 	}
    630  1.1  christos 
    631  1.1  christos 	return 0;
    632  1.1  christos }
    633  1.1  christos 
    634  1.1  christos /*
    635  1.1  christos  * Check if adding fd to epfd would violate the maximum depth or
    636  1.1  christos  * create a loop.
    637  1.1  christos  */
    638  1.1  christos static int
    639  1.1  christos epoll_check_loop_and_depth(struct lwp *l, int epfd, int fd)
    640  1.1  christos {
    641  1.1  christos 	int error;
    642  1.1  christos 	file_t *fp;
    643  1.1  christos 	struct epoll_edge *edges;
    644  1.1  christos 	struct epoll_seen *seen;
    645  1.1  christos 	size_t nedges, nfds, seen_size;
    646  1.1  christos 	bool fdirrelevant;
    647  1.1  christos 
    648  1.1  christos 	/* If the target isn't another kqueue, we can skip this check */
    649  1.1  christos 	fp = fd_getfile(fd);
    650  1.1  christos 	if (fp == NULL)
    651  1.1  christos 		return 0;
    652  1.1  christos 	fdirrelevant = fp->f_type != DTYPE_KQUEUE;
    653  1.1  christos 	fd_putfile(fd);
    654  1.1  christos 	if (fdirrelevant)
    655  1.1  christos 		return 0;
    656  1.1  christos 
    657  1.1  christos 	nfds = l->l_proc->p_fd->fd_lastfile + 1;
    658  1.1  christos 
    659  1.1  christos 	/*
    660  1.1  christos 	 * We call epoll_recover_watch_tree twice, once to find the
    661  1.1  christos 	 * number of edges, and once to actually fill them in.  We add one
    662  1.1  christos 	 * because we want to include the edge epfd->fd.
    663  1.1  christos 	 */
    664  1.3       rin 	nedges = 1 + epoll_recover_watch_tree(NULL, 0, nfds);
    665  1.1  christos 
    666  1.1  christos 	edges = kmem_zalloc(nedges * sizeof(*edges), KM_SLEEP);
    667  1.1  christos 
    668  1.1  christos 	epoll_recover_watch_tree(edges + 1, nedges - 1, nfds);
    669  1.1  christos 
    670  1.1  christos 	edges[0].epfd = epfd;
    671  1.1  christos 	edges[0].fd = fd;
    672  1.1  christos 
    673  1.1  christos 	seen_size = __BITMAP_SIZE(char, nfds);
    674  1.1  christos 	seen = kmem_zalloc(seen_size, KM_SLEEP);
    675  1.1  christos 
    676  1.1  christos 	error = epoll_dfs(edges, nedges, seen, nfds, epfd, 0);
    677  1.1  christos 
    678  1.1  christos 	kmem_free(seen, seen_size);
    679  1.1  christos 	kmem_free(edges, nedges * sizeof(*edges));
    680  1.1  christos 
    681  1.1  christos 	return error;
    682  1.1  christos }
    683