sys_epoll.c revision 1.2 1 1.2 pgoyette /* $NetBSD: sys_epoll.c,v 1.2 2023/07/29 04:06:32 pgoyette Exp $ */
2 1.1 christos
3 1.1 christos /*-
4 1.1 christos * SPDX-License-Identifier: BSD-2-Clause
5 1.1 christos *
6 1.1 christos * Copyright (c) 2007 Roman Divacky
7 1.1 christos * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
8 1.1 christos *
9 1.1 christos * Redistribution and use in source and binary forms, with or without
10 1.1 christos * modification, are permitted provided that the following conditions
11 1.1 christos * are met:
12 1.1 christos * 1. Redistributions of source code must retain the above copyright
13 1.1 christos * notice, this list of conditions and the following disclaimer.
14 1.1 christos * 2. Redistributions in binary form must reproduce the above copyright
15 1.1 christos * notice, this list of conditions and the following disclaimer in the
16 1.1 christos * documentation and/or other materials provided with the distribution.
17 1.1 christos *
18 1.1 christos * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 1.1 christos * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 1.1 christos * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 1.1 christos * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 1.1 christos * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 1.1 christos * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 1.1 christos * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 1.1 christos * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 1.1 christos * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 1.1 christos * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 1.1 christos * SUCH DAMAGE.
29 1.1 christos */
30 1.1 christos #include <sys/cdefs.h>
31 1.2 pgoyette __KERNEL_RCSID(0, "$NetBSD: sys_epoll.c,v 1.2 2023/07/29 04:06:32 pgoyette Exp $");
32 1.1 christos
33 1.1 christos
34 1.1 christos #include <sys/param.h>
35 1.1 christos #include <sys/types.h>
36 1.1 christos #include <sys/bitops.h>
37 1.1 christos #include <sys/epoll.h>
38 1.1 christos #include <sys/event.h>
39 1.1 christos #include <sys/eventvar.h>
40 1.1 christos #include <sys/errno.h>
41 1.1 christos #include <sys/file.h>
42 1.1 christos #include <sys/filedesc.h>
43 1.1 christos #include <sys/fcntl.h>
44 1.1 christos #include <sys/proc.h>
45 1.1 christos #include <sys/signal.h>
46 1.1 christos #include <sys/vnode.h>
47 1.1 christos
48 1.1 christos #include <sys/syscallargs.h>
49 1.1 christos
50 1.1 christos #define EPOLL_MAX_DEPTH 5
51 1.1 christos
52 1.1 christos #define EPOLL_EVRD (EPOLLIN|EPOLLRDNORM)
53 1.1 christos #define EPOLL_EVWR (EPOLLOUT|EPOLLWRNORM)
54 1.1 christos #define EPOLL_EVSUP (EPOLLET|EPOLLONESHOT|EPOLLHUP|EPOLLERR|EPOLLPRI \
55 1.1 christos |EPOLL_EVRD|EPOLL_EVWR|EPOLLRDHUP)
56 1.1 christos
57 1.1 christos #define kext_data ext[0]
58 1.1 christos #define kext_epfd ext[1]
59 1.1 christos #define kext_fd ext[2]
60 1.1 christos
61 1.1 christos #if DEBUG
62 1.1 christos #define DPRINTF(x) uprintf x
63 1.1 christos #else
64 1.1 christos #define DPRINTF(x) __nothing
65 1.1 christos #endif
66 1.1 christos
67 1.1 christos struct epoll_edge {
68 1.1 christos int epfd;
69 1.1 christos int fd;
70 1.1 christos };
71 1.1 christos
72 1.1 christos __BITMAP_TYPE(epoll_seen, char, 1);
73 1.1 christos
74 1.1 christos static int epoll_to_kevent(int, int, struct epoll_event *, struct kevent *,
75 1.1 christos int *);
76 1.1 christos static void kevent_to_epoll(struct kevent *, struct epoll_event *);
77 1.1 christos static int epoll_kev_put_events(void *, struct kevent *, struct kevent *,
78 1.1 christos size_t, int);
79 1.1 christos static int epoll_kev_fetch_changes(void *, const struct kevent *,
80 1.1 christos struct kevent *, size_t, int);
81 1.1 christos static int epoll_kev_fetch_timeout(const void *, void *, size_t);
82 1.1 christos static int epoll_register_kevent(register_t *, int, int, int,
83 1.1 christos unsigned int);
84 1.1 christos static int epoll_fd_registered(register_t *, int, int);
85 1.1 christos static int epoll_delete_all_events(register_t *, int, int);
86 1.1 christos static int epoll_recover_watch_tree(struct epoll_edge *, size_t, size_t);
87 1.1 christos static int epoll_dfs(struct epoll_edge *, size_t, struct epoll_seen *,
88 1.1 christos size_t, int, int);
89 1.1 christos static int epoll_check_loop_and_depth(struct lwp *, int, int);
90 1.1 christos
91 1.1 christos /*
92 1.1 christos * epoll_create1(2). Parse the flags and then create a kqueue instance.
93 1.1 christos */
94 1.1 christos int
95 1.1 christos sys_epoll_create1(struct lwp *l, const struct sys_epoll_create1_args *uap,
96 1.1 christos register_t *retval)
97 1.1 christos {
98 1.1 christos /* {
99 1.1 christos syscallarg(int) flags;
100 1.1 christos } */
101 1.1 christos struct sys_kqueue1_args kqa;
102 1.1 christos
103 1.1 christos if ((SCARG(uap, flags) & ~(O_CLOEXEC)) != 0)
104 1.1 christos return EINVAL;
105 1.1 christos
106 1.1 christos SCARG(&kqa, flags) = SCARG(uap, flags);
107 1.1 christos
108 1.1 christos return sys_kqueue1(l, &kqa, retval);
109 1.1 christos }
110 1.1 christos
111 1.1 christos /*
112 1.1 christos * Structure converting function from epoll to kevent.
113 1.1 christos */
114 1.1 christos static int
115 1.1 christos epoll_to_kevent(int epfd, int fd, struct epoll_event *l_event,
116 1.1 christos struct kevent *kevent, int *nkevents)
117 1.1 christos {
118 1.1 christos uint32_t levents = l_event->events;
119 1.1 christos uint32_t kev_flags = EV_ADD | EV_ENABLE;
120 1.1 christos
121 1.1 christos /* flags related to how event is registered */
122 1.1 christos if ((levents & EPOLLONESHOT) != 0)
123 1.1 christos kev_flags |= EV_DISPATCH;
124 1.1 christos if ((levents & EPOLLET) != 0)
125 1.1 christos kev_flags |= EV_CLEAR;
126 1.1 christos if ((levents & EPOLLERR) != 0)
127 1.1 christos kev_flags |= EV_ERROR;
128 1.1 christos if ((levents & EPOLLRDHUP) != 0)
129 1.1 christos kev_flags |= EV_EOF;
130 1.1 christos
131 1.1 christos /* flags related to what event is registered */
132 1.1 christos if ((levents & EPOLL_EVRD) != 0) {
133 1.1 christos EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
134 1.1 christos kevent->kext_data = l_event->data;
135 1.1 christos kevent->kext_epfd = epfd;
136 1.1 christos kevent->kext_fd = fd;
137 1.1 christos ++kevent;
138 1.1 christos ++(*nkevents);
139 1.1 christos }
140 1.1 christos if ((levents & EPOLL_EVWR) != 0) {
141 1.1 christos EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
142 1.1 christos kevent->kext_data = l_event->data;
143 1.1 christos kevent->kext_epfd = epfd;
144 1.1 christos kevent->kext_fd = fd;
145 1.1 christos ++kevent;
146 1.1 christos ++(*nkevents);
147 1.1 christos }
148 1.1 christos /* zero event mask is legal */
149 1.1 christos if ((levents & (EPOLL_EVRD | EPOLL_EVWR)) == 0) {
150 1.1 christos EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
151 1.1 christos ++(*nkevents);
152 1.1 christos }
153 1.1 christos
154 1.1 christos if ((levents & ~(EPOLL_EVSUP)) != 0) {
155 1.1 christos return EINVAL;
156 1.1 christos }
157 1.1 christos
158 1.1 christos return 0;
159 1.1 christos }
160 1.1 christos
161 1.1 christos /*
162 1.1 christos * Structure converting function from kevent to epoll. In a case
163 1.1 christos * this is called on error in registration we store the error in
164 1.1 christos * event->data and pick it up later in sys_epoll_ctl().
165 1.1 christos */
166 1.1 christos static void
167 1.1 christos kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
168 1.1 christos {
169 1.1 christos
170 1.1 christos l_event->data = kevent->kext_data;
171 1.1 christos
172 1.1 christos if ((kevent->flags & EV_ERROR) != 0) {
173 1.1 christos l_event->events = EPOLLERR;
174 1.1 christos return;
175 1.1 christos }
176 1.1 christos
177 1.1 christos /* XXX EPOLLPRI, EPOLLHUP */
178 1.1 christos switch (kevent->filter) {
179 1.1 christos case EVFILT_READ:
180 1.1 christos l_event->events = EPOLLIN;
181 1.1 christos if ((kevent->flags & EV_EOF) != 0)
182 1.1 christos l_event->events |= EPOLLRDHUP;
183 1.1 christos break;
184 1.1 christos case EVFILT_WRITE:
185 1.1 christos l_event->events = EPOLLOUT;
186 1.1 christos break;
187 1.1 christos default:
188 1.1 christos DPRINTF(("%s: unhandled kevent filter %d\n", __func__,
189 1.1 christos kevent->filter));
190 1.1 christos break;
191 1.1 christos }
192 1.1 christos }
193 1.1 christos
194 1.1 christos /*
195 1.1 christos * Copyout callback used by kevent. This converts kevent events to
196 1.1 christos * epoll events that are located in args->eventlist.
197 1.1 christos */
198 1.1 christos static int
199 1.1 christos epoll_kev_put_events(void *ctx, struct kevent *events,
200 1.1 christos struct kevent *eventlist, size_t index, int n)
201 1.1 christos {
202 1.1 christos int i;
203 1.1 christos struct epoll_event *eep = (struct epoll_event *)eventlist;
204 1.1 christos
205 1.1 christos KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);
206 1.1 christos
207 1.1 christos for (i = 0; i < n; i++)
208 1.1 christos kevent_to_epoll(events + i, eep + index + i);
209 1.1 christos
210 1.1 christos return 0;
211 1.1 christos }
212 1.1 christos
213 1.1 christos /*
214 1.1 christos * Copyin callback used by kevent. This copies already
215 1.1 christos * converted filters from kernel memory to the kevent
216 1.1 christos * internal kernel memory. Hence the memcpy instead of
217 1.1 christos * copyin.
218 1.1 christos */
219 1.1 christos static int
220 1.1 christos epoll_kev_fetch_changes(void *ctx, const struct kevent *changelist,
221 1.1 christos struct kevent *changes, size_t index, int n)
222 1.1 christos {
223 1.1 christos KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);
224 1.1 christos
225 1.1 christos memcpy(changes, changelist + index, n * sizeof(*changes));
226 1.1 christos
227 1.1 christos return 0;
228 1.1 christos }
229 1.1 christos
/*
 * Timeout copy callback used by kevent.  The timeout already lives in
 * kernel memory, so `size' bytes are moved from src to dest with a
 * plain memcpy instead of copyin.  Always returns 0.
 */
static int
epoll_kev_fetch_timeout(const void *src, void *dest, size_t size)
{

	(void)memcpy(dest, src, size);
	return 0;
}
242 1.1 christos
243 1.1 christos /*
244 1.1 christos * Load epoll filter, convert it to kevent filter and load it into
245 1.1 christos * kevent subsystem.
246 1.1 christos *
247 1.1 christos * event must point to kernel memory or be NULL.
248 1.1 christos */
249 1.1 christos int
250 1.1 christos epoll_ctl_common(struct lwp *l, register_t *retval, int epfd, int op, int fd,
251 1.1 christos struct epoll_event *event)
252 1.1 christos {
253 1.1 christos struct kevent kev[2];
254 1.1 christos struct kevent_ops k_ops = {
255 1.1 christos .keo_private = NULL,
256 1.1 christos .keo_fetch_timeout = NULL,
257 1.1 christos .keo_fetch_changes = epoll_kev_fetch_changes,
258 1.1 christos .keo_put_events = NULL,
259 1.1 christos };
260 1.1 christos file_t *epfp, *fp;
261 1.1 christos int error = 0;
262 1.1 christos int nchanges = 0;
263 1.1 christos
264 1.1 christos /*
265 1.1 christos * Need to validate epfd and fd separately from kevent1 to match
266 1.1 christos * Linux's errno behaviour.
267 1.1 christos */
268 1.1 christos epfp = fd_getfile(epfd);
269 1.1 christos if (epfp == NULL)
270 1.1 christos return EBADF;
271 1.1 christos if (epfp->f_type != DTYPE_KQUEUE)
272 1.1 christos error = EINVAL;
273 1.1 christos fd_putfile(epfd);
274 1.1 christos if (error != 0)
275 1.1 christos return error;
276 1.1 christos
277 1.1 christos fp = fd_getfile(fd);
278 1.1 christos if (fp == NULL)
279 1.1 christos return EBADF;
280 1.1 christos if (fp->f_type == DTYPE_VNODE) {
281 1.1 christos switch (fp->f_vnode->v_type) {
282 1.1 christos case VREG:
283 1.1 christos case VDIR:
284 1.1 christos case VBLK:
285 1.1 christos case VLNK:
286 1.1 christos error = EPERM;
287 1.1 christos break;
288 1.1 christos
289 1.1 christos default:
290 1.1 christos break;
291 1.1 christos }
292 1.1 christos }
293 1.1 christos fd_putfile(fd);
294 1.1 christos if (error != 0)
295 1.1 christos return error;
296 1.1 christos
297 1.1 christos /* Linux disallows spying on himself */
298 1.1 christos if (epfd == fd) {
299 1.1 christos return EINVAL;
300 1.1 christos }
301 1.1 christos
302 1.1 christos if (op != EPOLL_CTL_DEL) {
303 1.1 christos error = epoll_to_kevent(epfd, fd, event, kev, &nchanges);
304 1.1 christos if (error != 0)
305 1.1 christos return error;
306 1.1 christos }
307 1.1 christos
308 1.1 christos switch (op) {
309 1.1 christos case EPOLL_CTL_MOD:
310 1.1 christos error = epoll_delete_all_events(retval, epfd, fd);
311 1.1 christos if (error != 0)
312 1.1 christos return error;
313 1.1 christos break;
314 1.1 christos
315 1.1 christos case EPOLL_CTL_ADD:
316 1.1 christos if (epoll_fd_registered(retval, epfd, fd))
317 1.1 christos return EEXIST;
318 1.1 christos error = epoll_check_loop_and_depth(l, epfd, fd);
319 1.1 christos if (error != 0)
320 1.1 christos return error;
321 1.1 christos break;
322 1.1 christos
323 1.1 christos case EPOLL_CTL_DEL:
324 1.1 christos /* CTL_DEL means unregister this fd with this epoll */
325 1.1 christos return epoll_delete_all_events(retval, epfd, fd);
326 1.1 christos
327 1.1 christos default:
328 1.2 pgoyette DPRINTF(("%s: invalid op %d\n", __func__, op));
329 1.1 christos return EINVAL;
330 1.1 christos }
331 1.1 christos
332 1.1 christos error = kevent1(retval, epfd, kev, nchanges, NULL, 0, NULL, &k_ops);
333 1.1 christos
334 1.1 christos if (error == EOPNOTSUPP) {
335 1.1 christos error = EPERM;
336 1.1 christos }
337 1.1 christos
338 1.1 christos return error;
339 1.1 christos }
340 1.1 christos
341 1.1 christos /*
342 1.1 christos * epoll_ctl(2). Copyin event if necessary and then call
343 1.1 christos * epoll_ctl_common().
344 1.1 christos */
345 1.1 christos int
346 1.1 christos sys_epoll_ctl(struct lwp *l, const struct sys_epoll_ctl_args *uap,
347 1.1 christos register_t *retval)
348 1.1 christos {
349 1.1 christos /* {
350 1.1 christos syscallarg(int) epfd;
351 1.1 christos syscallarg(int) op;
352 1.1 christos syscallarg(int) fd;
353 1.1 christos syscallarg(struct epoll_event *) event;
354 1.1 christos } */
355 1.1 christos struct epoll_event ee;
356 1.1 christos struct epoll_event *eep;
357 1.1 christos int error;
358 1.1 christos
359 1.1 christos if (SCARG(uap, op) != EPOLL_CTL_DEL) {
360 1.1 christos error = copyin(SCARG(uap, event), &ee, sizeof(ee));
361 1.1 christos if (error != 0)
362 1.1 christos return error;
363 1.1 christos
364 1.1 christos eep = ⅇ
365 1.1 christos } else
366 1.1 christos eep = NULL;
367 1.1 christos
368 1.1 christos return epoll_ctl_common(l, retval, SCARG(uap, epfd), SCARG(uap, op),
369 1.1 christos SCARG(uap, fd), eep);
370 1.1 christos }
371 1.1 christos
372 1.1 christos /*
373 1.1 christos * Wait for a filter to be triggered on the epoll file descriptor.
374 1.1 christos * All of the epoll_*wait* syscalls eventually end up here.
375 1.1 christos *
376 1.1 christos * events, nss, and ssp must point to kernel memory (or be NULL).
377 1.1 christos */
378 1.1 christos int
379 1.1 christos epoll_wait_common(struct lwp *l, register_t *retval, int epfd,
380 1.1 christos struct epoll_event *events, int maxevents, struct timespec *tsp,
381 1.1 christos const sigset_t *nssp)
382 1.1 christos {
383 1.1 christos struct kevent_ops k_ops = {
384 1.1 christos .keo_private = NULL,
385 1.1 christos .keo_fetch_timeout = epoll_kev_fetch_timeout,
386 1.1 christos .keo_fetch_changes = NULL,
387 1.1 christos .keo_put_events = epoll_kev_put_events,
388 1.1 christos };
389 1.1 christos struct proc *p = l->l_proc;
390 1.1 christos file_t *epfp;
391 1.1 christos sigset_t oss;
392 1.1 christos int error = 0;
393 1.1 christos
394 1.1 christos if (maxevents <= 0 || maxevents > EPOLL_MAX_EVENTS)
395 1.1 christos return EINVAL;
396 1.1 christos
397 1.1 christos /*
398 1.1 christos * Need to validate epfd separately from kevent1 to match
399 1.1 christos * Linux's errno behaviour.
400 1.1 christos */
401 1.1 christos epfp = fd_getfile(epfd);
402 1.1 christos if (epfp == NULL)
403 1.1 christos return EBADF;
404 1.1 christos if (epfp->f_type != DTYPE_KQUEUE)
405 1.1 christos error = EINVAL;
406 1.1 christos fd_putfile(epfd);
407 1.1 christos if (error != 0)
408 1.1 christos return error;
409 1.1 christos
410 1.1 christos if (nssp != NULL) {
411 1.1 christos mutex_enter(p->p_lock);
412 1.1 christos error = sigprocmask1(l, SIG_SETMASK, nssp, &oss);
413 1.1 christos mutex_exit(p->p_lock);
414 1.1 christos if (error != 0)
415 1.1 christos return error;
416 1.1 christos }
417 1.1 christos
418 1.1 christos error = kevent1(retval, epfd, NULL, 0, (struct kevent *)events,
419 1.1 christos maxevents, tsp, &k_ops);
420 1.1 christos /*
421 1.1 christos * Since we're not registering nay events, ENOMEM should not
422 1.1 christos * be possible for this specific kevent1 call.
423 1.1 christos */
424 1.1 christos KASSERT(error != ENOMEM);
425 1.1 christos
426 1.1 christos if (nssp != NULL) {
427 1.1 christos mutex_enter(p->p_lock);
428 1.1 christos error = sigprocmask1(l, SIG_SETMASK, &oss, NULL);
429 1.1 christos mutex_exit(p->p_lock);
430 1.1 christos }
431 1.1 christos
432 1.1 christos return error;
433 1.1 christos }
434 1.1 christos
435 1.1 christos /*
436 1.1 christos * epoll_pwait2(2).
437 1.1 christos */
438 1.1 christos int
439 1.1 christos sys_epoll_pwait2(struct lwp *l, const struct sys_epoll_pwait2_args *uap,
440 1.1 christos register_t *retval)
441 1.1 christos {
442 1.1 christos /* {
443 1.1 christos syscallarg(int) epfd;
444 1.1 christos syscallarg(struct epoll_event *) events;
445 1.1 christos syscallarg(int) maxevents;
446 1.1 christos syscallarg(struct timespec *) timeout;
447 1.1 christos syscallarg(sigset_t *) sigmask;
448 1.1 christos } */
449 1.1 christos struct epoll_event *events;
450 1.1 christos struct timespec ts, *tsp;
451 1.1 christos sigset_t ss, *ssp;
452 1.1 christos int error;
453 1.1 christos const int maxevents = SCARG(uap, maxevents);
454 1.1 christos
455 1.1 christos if (maxevents <= 0 || maxevents >= EPOLL_MAX_EVENTS)
456 1.1 christos return EINVAL;
457 1.1 christos
458 1.1 christos if (SCARG(uap, timeout) != NULL) {
459 1.1 christos error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
460 1.1 christos if (error != 0)
461 1.1 christos return error;
462 1.1 christos
463 1.1 christos tsp = &ts;
464 1.1 christos } else
465 1.1 christos tsp = NULL;
466 1.1 christos
467 1.1 christos if (SCARG(uap, sigmask) != NULL) {
468 1.1 christos error = copyin(SCARG(uap, sigmask), &ss, sizeof(ss));
469 1.1 christos if (error != 0)
470 1.1 christos return error;
471 1.1 christos
472 1.1 christos ssp = &ss;
473 1.1 christos } else
474 1.1 christos ssp = NULL;
475 1.1 christos
476 1.1 christos events = kmem_alloc(maxevents * sizeof(*events), KM_SLEEP);
477 1.1 christos
478 1.1 christos error = epoll_wait_common(l, retval, SCARG(uap, epfd), events,
479 1.1 christos maxevents, tsp, ssp);
480 1.1 christos if (error == 0)
481 1.1 christos error = copyout(events, SCARG(uap, events),
482 1.1 christos *retval * sizeof(*events));
483 1.1 christos
484 1.1 christos kmem_free(events, maxevents * sizeof(*events));
485 1.1 christos return error;
486 1.1 christos }
487 1.1 christos
488 1.1 christos /*
489 1.1 christos * Helper that registers a single kevent.
490 1.1 christos */
491 1.1 christos static int
492 1.1 christos epoll_register_kevent(register_t *retval, int epfd, int fd, int filter,
493 1.1 christos unsigned int flags)
494 1.1 christos {
495 1.1 christos struct kevent kev;
496 1.1 christos struct kevent_ops k_ops = {
497 1.1 christos .keo_private = NULL,
498 1.1 christos .keo_fetch_timeout = NULL,
499 1.1 christos .keo_fetch_changes = epoll_kev_fetch_changes,
500 1.1 christos .keo_put_events = NULL,
501 1.1 christos };
502 1.1 christos
503 1.1 christos EV_SET(&kev, fd, filter, flags, 0, 0, 0);
504 1.1 christos
505 1.1 christos return kevent1(retval, epfd, &kev, 1, NULL, 0, NULL, &k_ops);
506 1.1 christos }
507 1.1 christos
508 1.1 christos /*
509 1.1 christos * Check if an fd is already registered in the kqueue referenced by epfd.
510 1.1 christos */
511 1.1 christos static int
512 1.1 christos epoll_fd_registered(register_t *retval, int epfd, int fd)
513 1.1 christos {
514 1.1 christos /*
515 1.1 christos * Set empty filter flags to avoid accidental modification of already
516 1.1 christos * registered events. In the case of event re-registration:
517 1.1 christos * 1. If event does not exists kevent() does nothing and returns ENOENT
518 1.1 christos * 2. If event does exists, it's enabled/disabled state is preserved
519 1.1 christos * but fflags, data and udata fields are overwritten. So we can not
520 1.1 christos * set socket lowats and store user's context pointer in udata.
521 1.1 christos */
522 1.1 christos if (epoll_register_kevent(retval, epfd, fd, EVFILT_READ, 0) != ENOENT ||
523 1.1 christos epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE, 0) != ENOENT)
524 1.1 christos return 1;
525 1.1 christos
526 1.1 christos return 0;
527 1.1 christos }
528 1.1 christos
529 1.1 christos /*
530 1.1 christos * Remove all events in the kqueue referenced by epfd that depend on
531 1.1 christos * fd.
532 1.1 christos */
533 1.1 christos static int
534 1.1 christos epoll_delete_all_events(register_t *retval, int epfd, int fd)
535 1.1 christos {
536 1.1 christos int error1, error2;
537 1.1 christos
538 1.1 christos error1 = epoll_register_kevent(retval, epfd, fd, EVFILT_READ,
539 1.1 christos EV_DELETE);
540 1.1 christos error2 = epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE,
541 1.1 christos EV_DELETE);
542 1.1 christos
543 1.1 christos /* return 0 if at least one result positive */
544 1.1 christos return error1 == 0 ? 0 : error2;
545 1.1 christos }
546 1.1 christos
547 1.1 christos /*
548 1.1 christos * Interate through all the knotes and recover a directed graph on
549 1.1 christos * which kqueues are watching each other.
550 1.1 christos *
551 1.1 christos * If edges is NULL, the number of edges is still counted but no graph
552 1.1 christos * is assembled.
553 1.1 christos */
554 1.1 christos static int
555 1.1 christos epoll_recover_watch_tree(struct epoll_edge *edges, size_t nedges, size_t nfds) {
556 1.1 christos file_t *currfp, *targetfp;
557 1.1 christos struct knote *kn, *tmpkn;
558 1.1 christos size_t i, nedges_so_far = 0;
559 1.1 christos
560 1.1 christos for (i = 0; i < nfds && (edges == NULL || nedges_so_far < nedges); i++)
561 1.1 christos {
562 1.1 christos currfp = fd_getfile(i);
563 1.1 christos if (currfp == NULL)
564 1.1 christos continue;
565 1.1 christos if (currfp->f_type != DTYPE_KQUEUE)
566 1.1 christos goto continue_count_outer;
567 1.1 christos
568 1.1 christos SLIST_FOREACH_SAFE(kn, &currfp->f_kqueue->kq_sel.sel_klist,
569 1.1 christos kn_selnext, tmpkn) {
570 1.1 christos targetfp = fd_getfile(kn->kn_kevent.kext_epfd);
571 1.1 christos if (targetfp == NULL)
572 1.1 christos continue;
573 1.1 christos if (targetfp->f_type == DTYPE_KQUEUE) {
574 1.1 christos if (edges != NULL) {
575 1.1 christos edges[nedges_so_far].epfd =
576 1.1 christos kn->kn_kevent.kext_epfd;
577 1.1 christos edges[nedges_so_far].fd =
578 1.1 christos kn->kn_kevent.kext_fd;
579 1.1 christos }
580 1.1 christos nedges_so_far++;
581 1.1 christos }
582 1.1 christos
583 1.1 christos fd_putfile(kn->kn_kevent.kext_epfd);
584 1.1 christos }
585 1.1 christos
586 1.1 christos continue_count_outer:
587 1.1 christos fd_putfile(i);
588 1.1 christos }
589 1.1 christos
590 1.1 christos return nedges_so_far;
591 1.1 christos }
592 1.1 christos
593 1.1 christos /*
594 1.1 christos * Run dfs on the graph described by edges, checking for loops and a
595 1.1 christos * depth greater than EPOLL_MAX_DEPTH.
596 1.1 christos */
597 1.1 christos static int
598 1.1 christos epoll_dfs(struct epoll_edge *edges, size_t nedges, struct epoll_seen *seen,
599 1.1 christos size_t nseen, int currfd, int depth)
600 1.1 christos {
601 1.1 christos int error;
602 1.1 christos size_t i;
603 1.1 christos
604 1.1 christos KASSERT(edges != NULL);
605 1.1 christos KASSERT(seen != NULL);
606 1.1 christos KASSERT(nedges > 0);
607 1.1 christos KASSERT(currfd < nseen);
608 1.1 christos KASSERT(0 <= depth && depth <= EPOLL_MAX_DEPTH + 1);
609 1.1 christos
610 1.1 christos if (__BITMAP_ISSET(currfd, seen))
611 1.1 christos return ELOOP;
612 1.1 christos
613 1.1 christos __BITMAP_SET(currfd, seen);
614 1.1 christos
615 1.1 christos depth++;
616 1.1 christos if (depth > EPOLL_MAX_DEPTH)
617 1.1 christos return EINVAL;
618 1.1 christos
619 1.1 christos for (i = 0; i < nedges; i++) {
620 1.1 christos if (edges[i].epfd != currfd)
621 1.1 christos continue;
622 1.1 christos
623 1.1 christos error = epoll_dfs(edges, nedges, seen, nseen,
624 1.1 christos edges[i].fd, depth);
625 1.1 christos if (error != 0)
626 1.1 christos return error;
627 1.1 christos }
628 1.1 christos
629 1.1 christos return 0;
630 1.1 christos }
631 1.1 christos
632 1.1 christos /*
633 1.1 christos * Check if adding fd to epfd would violate the maximum depth or
634 1.1 christos * create a loop.
635 1.1 christos */
636 1.1 christos static int
637 1.1 christos epoll_check_loop_and_depth(struct lwp *l, int epfd, int fd)
638 1.1 christos {
639 1.1 christos int error;
640 1.1 christos file_t *fp;
641 1.1 christos struct epoll_edge *edges;
642 1.1 christos struct epoll_seen *seen;
643 1.1 christos size_t nedges, nfds, seen_size;
644 1.1 christos bool fdirrelevant;
645 1.1 christos
646 1.1 christos /* If the target isn't another kqueue, we can skip this check */
647 1.1 christos fp = fd_getfile(fd);
648 1.1 christos if (fp == NULL)
649 1.1 christos return 0;
650 1.1 christos fdirrelevant = fp->f_type != DTYPE_KQUEUE;
651 1.1 christos fd_putfile(fd);
652 1.1 christos if (fdirrelevant)
653 1.1 christos return 0;
654 1.1 christos
655 1.1 christos nfds = l->l_proc->p_fd->fd_lastfile + 1;
656 1.1 christos
657 1.1 christos /*
658 1.1 christos * We call epoll_recover_watch_tree twice, once to find the
659 1.1 christos * number of edges, and once to actually fill them in. We add one
660 1.1 christos * because we want to include the edge epfd->fd.
661 1.1 christos */
662 1.1 christos nedges = 1 + epoll_recover_watch_tree(NULL, 0, nfds);
663 1.1 christos
664 1.1 christos edges = kmem_zalloc(nedges * sizeof(*edges), KM_SLEEP);
665 1.1 christos
666 1.1 christos epoll_recover_watch_tree(edges + 1, nedges - 1, nfds);
667 1.1 christos
668 1.1 christos edges[0].epfd = epfd;
669 1.1 christos edges[0].fd = fd;
670 1.1 christos
671 1.1 christos seen_size = __BITMAP_SIZE(char, nfds);
672 1.1 christos seen = kmem_zalloc(seen_size, KM_SLEEP);
673 1.1 christos
674 1.1 christos error = epoll_dfs(edges, nedges, seen, nfds, epfd, 0);
675 1.1 christos
676 1.1 christos kmem_free(seen, seen_size);
677 1.1 christos kmem_free(edges, nedges * sizeof(*edges));
678 1.1 christos
679 1.1 christos return error;
680 1.1 christos }
681