1 /* $NetBSD: kern_event.c,v 1.124 2021/09/26 21:29:38 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
34 * Copyright (c) 2009 Apple, Inc
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
59 */
60
61 #include <sys/cdefs.h>
62 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.124 2021/09/26 21:29:38 thorpej Exp $");
63
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/kernel.h>
67 #include <sys/wait.h>
68 #include <sys/proc.h>
69 #include <sys/file.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/poll.h>
75 #include <sys/kmem.h>
76 #include <sys/stat.h>
77 #include <sys/filedesc.h>
78 #include <sys/syscallargs.h>
79 #include <sys/kauth.h>
80 #include <sys/conf.h>
81 #include <sys/atomic.h>
82
83 static int kqueue_scan(file_t *, size_t, struct kevent *,
84 const struct timespec *, register_t *,
85 const struct kevent_ops *, struct kevent *,
86 size_t);
87 static int kqueue_ioctl(file_t *, u_long, void *);
88 static int kqueue_fcntl(file_t *, u_int, void *);
89 static int kqueue_poll(file_t *, int);
90 static int kqueue_kqfilter(file_t *, struct knote *);
91 static int kqueue_stat(file_t *, struct stat *);
92 static int kqueue_close(file_t *);
93 static void kqueue_restart(file_t *);
94 static int kqueue_register(struct kqueue *, struct kevent *);
95 static void kqueue_doclose(struct kqueue *, struct klist *, int);
96
97 static void knote_detach(struct knote *, filedesc_t *fdp, bool);
98 static void knote_enqueue(struct knote *);
99 static void knote_activate(struct knote *);
100
101 static void filt_kqdetach(struct knote *);
102 static int filt_kqueue(struct knote *, long hint);
103 static int filt_procattach(struct knote *);
104 static void filt_procdetach(struct knote *);
105 static int filt_proc(struct knote *, long hint);
106 static int filt_fileattach(struct knote *);
107 static void filt_timerexpire(void *x);
108 static int filt_timerattach(struct knote *);
109 static void filt_timerdetach(struct knote *);
110 static int filt_timer(struct knote *, long hint);
111 static int filt_userattach(struct knote *);
112 static void filt_userdetach(struct knote *);
113 static int filt_user(struct knote *, long hint);
114 static void filt_usertouch(struct knote *, struct kevent *, long type);
115
116 static const struct fileops kqueueops = {
117 .fo_name = "kqueue",
118 .fo_read = (void *)enxio,
119 .fo_write = (void *)enxio,
120 .fo_ioctl = kqueue_ioctl,
121 .fo_fcntl = kqueue_fcntl,
122 .fo_poll = kqueue_poll,
123 .fo_stat = kqueue_stat,
124 .fo_close = kqueue_close,
125 .fo_kqfilter = kqueue_kqfilter,
126 .fo_restart = kqueue_restart,
127 };
128
129 static const struct filterops kqread_filtops = {
130 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
131 .f_attach = NULL,
132 .f_detach = filt_kqdetach,
133 .f_event = filt_kqueue,
134 };
135
136 static const struct filterops proc_filtops = {
137 .f_flags = 0,
138 .f_attach = filt_procattach,
139 .f_detach = filt_procdetach,
140 .f_event = filt_proc,
141 };
142
143 /*
144 * file_filtops is not marked MPSAFE because it's going to call
145 * fileops::fo_kqfilter(), which might not be. That function,
146 * however, will override the knote's filterops, and thus will
147 * inherit the MPSAFE-ness of the back-end at that time.
148 */
149 static const struct filterops file_filtops = {
150 .f_flags = FILTEROP_ISFD,
151 .f_attach = filt_fileattach,
152 .f_detach = NULL,
153 .f_event = NULL,
154 };
155
156 static const struct filterops timer_filtops = {
157 .f_flags = 0,
158 .f_attach = filt_timerattach,
159 .f_detach = filt_timerdetach,
160 .f_event = filt_timer,
161 };
162
163 static const struct filterops user_filtops = {
164 .f_flags = FILTEROP_MPSAFE,
165 .f_attach = filt_userattach,
166 .f_detach = filt_userdetach,
167 .f_event = filt_user,
168 .f_touch = filt_usertouch,
169 };
170
171 static u_int kq_ncallouts = 0;
172 static int kq_calloutmax = (4 * 1024);
173
174 #define KN_HASHSIZE 64 /* XXX should be tunable */
175 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
176
177 extern const struct filterops fs_filtops; /* vfs_syscalls.c */
178 extern const struct filterops sig_filtops; /* kern_sig.c */
179
180 #define KQ_FLUX_WAKEUP(kq) cv_broadcast(&kq->kq_cv)
181
182 /*
183 * Table for all system-defined filters.
184 * These should be listed in the numeric order of the EVFILT_* defines.
185 * If filtops is NULL, the filter isn't implemented in NetBSD.
186 * End of list is when name is NULL.
187 *
188 * Note that 'refcnt' is meaningless for built-in filters.
189 */
190 struct kfilter {
191 const char *name; /* name of filter */
192 uint32_t filter; /* id of filter */
193 unsigned refcnt; /* reference count */
194 const struct filterops *filtops;/* operations for filter */
195 size_t namelen; /* length of name string */
196 };
197
198 /* System defined filters */
199 static struct kfilter sys_kfilters[] = {
200 { "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
201 { "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
202 { "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
203 { "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
204 { "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
205 { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
206 { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
207 { "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
208 { "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 },
209 { NULL, 0, 0, NULL, 0 },
210 };
211
212 /* User defined kfilters */
213 static struct kfilter *user_kfilters; /* array */
214 static int user_kfilterc; /* current offset */
215 static int user_kfiltermaxc; /* max size so far */
216 static size_t user_kfiltersz; /* size of allocated memory */
217
218 /*
219 * Global Locks.
220 *
221 * Lock order:
222 *
223 * kqueue_filter_lock
224 * -> kn_kq->kq_fdp->fd_lock
225 * -> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
226 * -> kn_kq->kq_lock
227 *
228 * Locking rules:
229 *
230 * f_attach: fdp->fd_lock, KERNEL_LOCK
231 * f_detach: fdp->fd_lock, KERNEL_LOCK
232 * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
233 * f_event via knote: whatever caller guarantees
234 * Typically, f_event(NOTE_SUBMIT) via knote: object lock
235 * f_event(!NOTE_SUBMIT) via knote: nothing,
236 * acquires/releases object lock inside.
237 */
238 static krwlock_t kqueue_filter_lock; /* lock on filter lists */
239 static kmutex_t kqueue_misc_lock; /* miscellaneous */
240
241 static int
242 filter_attach(struct knote *kn)
243 {
244 int rv;
245
246 KASSERT(kn->kn_fop != NULL);
247 KASSERT(kn->kn_fop->f_attach != NULL);
248
249 /*
250 * N.B. that kn->kn_fop may change as the result of calling
251 * f_attach().
252 */
253 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
254 rv = kn->kn_fop->f_attach(kn);
255 } else {
256 KERNEL_LOCK(1, NULL);
257 rv = kn->kn_fop->f_attach(kn);
258 KERNEL_UNLOCK_ONE(NULL);
259 }
260
261 return rv;
262 }
263
264 static void
265 filter_detach(struct knote *kn)
266 {
267 KASSERT(kn->kn_fop != NULL);
268 KASSERT(kn->kn_fop->f_detach != NULL);
269
270 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
271 kn->kn_fop->f_detach(kn);
272 } else {
273 KERNEL_LOCK(1, NULL);
274 kn->kn_fop->f_detach(kn);
275 KERNEL_UNLOCK_ONE(NULL);
276 }
277 }
278
279 static int
280 filter_event(struct knote *kn, long hint)
281 {
282 int rv;
283
284 KASSERT(kn->kn_fop != NULL);
285 KASSERT(kn->kn_fop->f_event != NULL);
286
287 if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
288 rv = kn->kn_fop->f_event(kn, hint);
289 } else {
290 KERNEL_LOCK(1, NULL);
291 rv = kn->kn_fop->f_event(kn, hint);
292 KERNEL_UNLOCK_ONE(NULL);
293 }
294
295 return rv;
296 }
297
298 static void
299 filter_touch(struct knote *kn, struct kevent *kev, long type)
300 {
301 kn->kn_fop->f_touch(kn, kev, type);
302 }
303
304 static kauth_listener_t kqueue_listener;
305
306 static int
307 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
308 void *arg0, void *arg1, void *arg2, void *arg3)
309 {
310 struct proc *p;
311 int result;
312
313 result = KAUTH_RESULT_DEFER;
314 p = arg0;
315
316 if (action != KAUTH_PROCESS_KEVENT_FILTER)
317 return result;
318
319 if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
320 ISSET(p->p_flag, PK_SUGID)))
321 return result;
322
323 result = KAUTH_RESULT_ALLOW;
324
325 return result;
326 }
327
328 /*
329 * Initialize the kqueue subsystem.
330 */
331 void
332 kqueue_init(void)
333 {
334
335 rw_init(&kqueue_filter_lock);
336 mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
337
338 kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
339 kqueue_listener_cb, NULL);
340 }
341
342 /*
343 * Find kfilter entry by name, or NULL if not found.
344 */
345 static struct kfilter *
346 kfilter_byname_sys(const char *name)
347 {
348 int i;
349
350 KASSERT(rw_lock_held(&kqueue_filter_lock));
351
352 for (i = 0; sys_kfilters[i].name != NULL; i++) {
353 if (strcmp(name, sys_kfilters[i].name) == 0)
354 return &sys_kfilters[i];
355 }
356 return NULL;
357 }
358
359 static struct kfilter *
360 kfilter_byname_user(const char *name)
361 {
362 int i;
363
364 KASSERT(rw_lock_held(&kqueue_filter_lock));
365
366 /* user filter slots have a NULL name if previously deregistered */
367 for (i = 0; i < user_kfilterc ; i++) {
368 if (user_kfilters[i].name != NULL &&
369 strcmp(name, user_kfilters[i].name) == 0)
370 return &user_kfilters[i];
371 }
372 return NULL;
373 }
374
375 static struct kfilter *
376 kfilter_byname(const char *name)
377 {
378 struct kfilter *kfilter;
379
380 KASSERT(rw_lock_held(&kqueue_filter_lock));
381
382 if ((kfilter = kfilter_byname_sys(name)) != NULL)
383 return kfilter;
384
385 return kfilter_byname_user(name);
386 }
387
388 /*
389 * Find kfilter entry by filter id, or NULL if not found.
390 * Assumes entries are indexed in filter id order, for speed.
391 */
392 static struct kfilter *
393 kfilter_byfilter(uint32_t filter)
394 {
395 struct kfilter *kfilter;
396
397 KASSERT(rw_lock_held(&kqueue_filter_lock));
398
399 if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
400 kfilter = &sys_kfilters[filter];
401 else if (user_kfilters != NULL &&
402 filter < EVFILT_SYSCOUNT + user_kfilterc)
403 /* it's a user filter */
404 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
405 else
406 return (NULL); /* out of range */
407 KASSERT(kfilter->filter == filter); /* sanity check! */
408 return (kfilter);
409 }
410
411 /*
412 * Register a new kfilter. Stores the entry in user_kfilters.
413 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
414 * If retfilter != NULL, the new filterid is returned in it.
415 */
416 int
417 kfilter_register(const char *name, const struct filterops *filtops,
418 int *retfilter)
419 {
420 struct kfilter *kfilter;
421 size_t len;
422 int i;
423
424 if (name == NULL || name[0] == '\0' || filtops == NULL)
425 return (EINVAL); /* invalid args */
426
427 rw_enter(&kqueue_filter_lock, RW_WRITER);
428 if (kfilter_byname(name) != NULL) {
429 rw_exit(&kqueue_filter_lock);
430 return (EEXIST); /* already exists */
431 }
432 if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
433 rw_exit(&kqueue_filter_lock);
434 return (EINVAL); /* too many */
435 }
436
437 for (i = 0; i < user_kfilterc; i++) {
438 kfilter = &user_kfilters[i];
439 if (kfilter->name == NULL) {
440 /* Previously deregistered slot. Reuse. */
441 goto reuse;
442 }
443 }
444
445 /* check if need to grow user_kfilters */
446 if (user_kfilterc + 1 > user_kfiltermaxc) {
447 /* Grow in KFILTER_EXTENT chunks. */
448 user_kfiltermaxc += KFILTER_EXTENT;
449 len = user_kfiltermaxc * sizeof(*kfilter);
450 kfilter = kmem_alloc(len, KM_SLEEP);
451 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
452 if (user_kfilters != NULL) {
453 memcpy(kfilter, user_kfilters, user_kfiltersz);
454 kmem_free(user_kfilters, user_kfiltersz);
455 }
456 user_kfiltersz = len;
457 user_kfilters = kfilter;
458 }
459 /* Adding new slot */
460 kfilter = &user_kfilters[user_kfilterc++];
461 reuse:
462 kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
463
464 kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
465
466 kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
467 memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
468
469 if (retfilter != NULL)
470 *retfilter = kfilter->filter;
471 rw_exit(&kqueue_filter_lock);
472
473 return (0);
474 }
475
476 /*
477 * Unregister a kfilter previously registered with kfilter_register.
478 * This retains the filter id, but clears the name and frees filtops (filter
479 * operations), so that the number isn't reused during a boot.
480 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
481 */
482 int
483 kfilter_unregister(const char *name)
484 {
485 struct kfilter *kfilter;
486
487 if (name == NULL || name[0] == '\0')
488 return (EINVAL); /* invalid name */
489
490 rw_enter(&kqueue_filter_lock, RW_WRITER);
491 if (kfilter_byname_sys(name) != NULL) {
492 rw_exit(&kqueue_filter_lock);
493 return (EINVAL); /* can't detach system filters */
494 }
495
496 kfilter = kfilter_byname_user(name);
497 if (kfilter == NULL) {
498 rw_exit(&kqueue_filter_lock);
499 return (ENOENT);
500 }
501 if (kfilter->refcnt != 0) {
502 rw_exit(&kqueue_filter_lock);
503 return (EBUSY);
504 }
505
506 /* Cast away const (but we know it's safe). */
507 kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
508 kfilter->name = NULL; /* mark as `not implemented' */
509
510 if (kfilter->filtops != NULL) {
511 /* Cast away const (but we know it's safe). */
512 kmem_free(__UNCONST(kfilter->filtops),
513 sizeof(*kfilter->filtops));
514 kfilter->filtops = NULL; /* mark as `not implemented' */
515 }
516 rw_exit(&kqueue_filter_lock);
517
518 return (0);
519 }
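
/*
 * Example (illustrative sketch, not part of the original source): a
 * hypothetical kernel module could register a custom filter at load time
 * and remove it at unload.  All "example_*" names below are assumptions
 * for illustration only; the returned filter id is what a user program
 * would pass in a kevent's filter field after resolving the name with
 * the KFILTER_BYNAME ioctl handled further down in this file.
 */
#if 0
static int example_filter_id;

static const struct filterops example_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = example_filt_attach,	/* assumed helper routines */
	.f_detach = example_filt_detach,
	.f_event = example_filt_event,
};

static int
example_register(void)
{

	/* Returns 0 and stores the new filter id, or an errno. */
	return kfilter_register("EVFILT_EXAMPLE", &example_filtops,
	    &example_filter_id);
}

static int
example_unregister(void)
{

	/* Fails with EBUSY while any knote still references the filter. */
	return kfilter_unregister("EVFILT_EXAMPLE");
}
#endif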
520
521
522 /*
523 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
524 * descriptors. Calls fileops kqfilter method for given file descriptor.
525 */
526 static int
527 filt_fileattach(struct knote *kn)
528 {
529 file_t *fp;
530
531 fp = kn->kn_obj;
532
533 return (*fp->f_ops->fo_kqfilter)(fp, kn);
534 }
535
536 /*
537 * Filter detach method for EVFILT_READ on kqueue descriptor.
538 */
539 static void
540 filt_kqdetach(struct knote *kn)
541 {
542 struct kqueue *kq;
543
544 kq = ((file_t *)kn->kn_obj)->f_kqueue;
545
546 mutex_spin_enter(&kq->kq_lock);
547 selremove_knote(&kq->kq_sel, kn);
548 mutex_spin_exit(&kq->kq_lock);
549 }
550
551 /*
552 * Filter event method for EVFILT_READ on kqueue descriptor.
553 */
554 /*ARGSUSED*/
555 static int
556 filt_kqueue(struct knote *kn, long hint)
557 {
558 struct kqueue *kq;
559 int rv;
560
561 kq = ((file_t *)kn->kn_obj)->f_kqueue;
562
563 if (hint != NOTE_SUBMIT)
564 mutex_spin_enter(&kq->kq_lock);
565 kn->kn_data = KQ_COUNT(kq);
566 rv = (kn->kn_data > 0);
567 if (hint != NOTE_SUBMIT)
568 mutex_spin_exit(&kq->kq_lock);
569
570 return rv;
571 }
572
573 /*
574 * Filter attach method for EVFILT_PROC.
575 */
576 static int
577 filt_procattach(struct knote *kn)
578 {
579 struct proc *p;
580 struct lwp *curl;
581
582 curl = curlwp;
583
584 mutex_enter(&proc_lock);
585 if (kn->kn_flags & EV_FLAG1) {
586 /*
587 * NOTE_TRACK attaches to the child process too early
588 * for proc_find, so do a raw look up and check the state
589 * explicitly.
590 */
591 p = proc_find_raw(kn->kn_id);
592 if (p != NULL && p->p_stat != SIDL)
593 p = NULL;
594 } else {
595 p = proc_find(kn->kn_id);
596 }
597
598 if (p == NULL) {
599 mutex_exit(&proc_lock);
600 return ESRCH;
601 }
602
603 /*
604 * Fail if it's not owned by you, or the last exec gave us
605 * setuid/setgid privs (unless you're root).
606 */
607 mutex_enter(p->p_lock);
608 mutex_exit(&proc_lock);
609 if (kauth_authorize_process(curl->l_cred,
610 KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
611 mutex_exit(p->p_lock);
612 return EACCES;
613 }
614
615 kn->kn_obj = p;
616 kn->kn_flags |= EV_CLEAR; /* automatically set */
617
618 /*
619 * internal flag indicating registration done by kernel
620 */
621 if (kn->kn_flags & EV_FLAG1) {
622 kn->kn_data = kn->kn_sdata; /* ppid */
623 kn->kn_fflags = NOTE_CHILD;
624 kn->kn_flags &= ~EV_FLAG1;
625 }
626 SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
627 mutex_exit(p->p_lock);
628
629 return 0;
630 }
631
632 /*
633 * Filter detach method for EVFILT_PROC.
634 *
635 * The knote may be attached to a different process, which may exit,
636 * leaving nothing for the knote to be attached to. So when the process
637 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
638 * it will be deleted when read out. However, as part of the knote deletion,
639 * this routine is called, so a check is needed to avoid actually performing
640 * a detach, because the original process might not exist any more.
641 */
642 static void
643 filt_procdetach(struct knote *kn)
644 {
645 struct proc *p;
646
647 if (kn->kn_status & KN_DETACHED)
648 return;
649
650 p = kn->kn_obj;
651
652 mutex_enter(p->p_lock);
653 SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
654 mutex_exit(p->p_lock);
655 }
656
657 /*
658 * Filter event method for EVFILT_PROC.
659 */
660 static int
661 filt_proc(struct knote *kn, long hint)
662 {
663 u_int event, fflag;
664 struct kevent kev;
665 struct kqueue *kq;
666 int error;
667
668 event = (u_int)hint & NOTE_PCTRLMASK;
669 kq = kn->kn_kq;
670 fflag = 0;
671
672 /* If the user is interested in this event, record it. */
673 if (kn->kn_sfflags & event)
674 fflag |= event;
675
676 if (event == NOTE_EXIT) {
677 struct proc *p = kn->kn_obj;
678
679 if (p != NULL)
680 kn->kn_data = P_WAITSTATUS(p);
681 /*
682 * Process is gone, so flag the event as finished.
683 *
684 * Detach the knote from watched process and mark
685 * it as such. We can't leave this to kqueue_scan(),
686 * since the process might not exist by then. And we
687 * have to do this now, since psignal's KNOTE() is also
688 * called for zombies and we might end up reading freed
689 * memory if the kevent had already been picked up and
690 * the knote g/c'ed.
691 */
692 filt_procdetach(kn);
693
694 mutex_spin_enter(&kq->kq_lock);
695 kn->kn_status |= KN_DETACHED;
696 /* Mark as ONESHOT, so that the knote is g/c'ed when read */
697 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
698 kn->kn_fflags |= fflag;
699 mutex_spin_exit(&kq->kq_lock);
700
701 return 1;
702 }
703
704 mutex_spin_enter(&kq->kq_lock);
705 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
706 /*
707 * Process forked, and user wants to track the new process,
708 * so attach a new knote to it, and immediately report an
709 * event with the parent's pid. Register knote with new
710 * process.
711 */
712 memset(&kev, 0, sizeof(kev));
713 kev.ident = hint & NOTE_PDATAMASK; /* pid */
714 kev.filter = kn->kn_filter;
715 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
716 kev.fflags = kn->kn_sfflags;
717 kev.data = kn->kn_id; /* parent */
718 kev.udata = kn->kn_kevent.udata; /* preserve udata */
719 mutex_spin_exit(&kq->kq_lock);
720 error = kqueue_register(kq, &kev);
721 mutex_spin_enter(&kq->kq_lock);
722 if (error != 0)
723 kn->kn_fflags |= NOTE_TRACKERR;
724 }
725 kn->kn_fflags |= fflag;
726 fflag = kn->kn_fflags;
727 mutex_spin_exit(&kq->kq_lock);
728
729 return fflag != 0;
730 }
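
/*
 * Userland sketch (illustrative, not part of the original source): watch a
 * child process for exit and fork, tracking new descendants via NOTE_TRACK.
 * Tracked children are reported as NOTE_CHILD events whose data field holds
 * the parent pid, as arranged by the EV_FLAG1 path in filt_procattach()
 * above; NOTE_EXIT reports the wait(2) status in data.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	pid_t pid;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	if ((pid = fork()) == -1)
		err(1, "fork");
	if (pid == 0) {
		sleep(1);
		_exit(0);
	}
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent: wait");
		if (kev.fflags & NOTE_CHILD)
			printf("new child %d of parent %lld\n",
			    (int)kev.ident, (long long)kev.data);
		if (kev.fflags & NOTE_EXIT) {
			printf("pid %d exited, status %#llx\n",
			    (int)kev.ident, (long long)kev.data);
			break;
		}
	}
	return 0;
}
#endif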
731
732 static void
733 filt_timerexpire(void *knx)
734 {
735 struct knote *kn = knx;
736 int tticks;
737
738 mutex_enter(&kqueue_misc_lock);
739 kn->kn_data++;
740 knote_activate(kn);
741 if ((kn->kn_flags & EV_ONESHOT) == 0) {
742 tticks = mstohz(kn->kn_sdata);
743 if (tticks <= 0)
744 tticks = 1;
745 callout_schedule((callout_t *)kn->kn_hook, tticks);
746 }
747 mutex_exit(&kqueue_misc_lock);
748 }
749
750 /*
751 * data contains amount of time to sleep, in milliseconds
752 */
753 static int
754 filt_timerattach(struct knote *kn)
755 {
756 callout_t *calloutp;
757 struct kqueue *kq;
758 int tticks;
759
760 tticks = mstohz(kn->kn_sdata);
761
762 /* if the supplied value is under our resolution, use 1 tick */
763 if (tticks == 0) {
764 if (kn->kn_sdata == 0)
765 return EINVAL;
766 tticks = 1;
767 }
768
769 if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
770 (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
771 atomic_dec_uint(&kq_ncallouts);
772 return ENOMEM;
773 }
774 callout_init(calloutp, CALLOUT_MPSAFE);
775
776 kq = kn->kn_kq;
777 mutex_spin_enter(&kq->kq_lock);
778 kn->kn_flags |= EV_CLEAR; /* automatically set */
779 kn->kn_hook = calloutp;
780 mutex_spin_exit(&kq->kq_lock);
781
782 callout_reset(calloutp, tticks, filt_timerexpire, kn);
783
784 return (0);
785 }
786
787 static void
788 filt_timerdetach(struct knote *kn)
789 {
790 callout_t *calloutp;
791 struct kqueue *kq = kn->kn_kq;
792
793 mutex_spin_enter(&kq->kq_lock);
794 /* prevent rescheduling when we expire */
795 kn->kn_flags |= EV_ONESHOT;
796 mutex_spin_exit(&kq->kq_lock);
797
798 calloutp = (callout_t *)kn->kn_hook;
799 callout_halt(calloutp, NULL);
800 callout_destroy(calloutp);
801 kmem_free(calloutp, sizeof(*calloutp));
802 atomic_dec_uint(&kq_ncallouts);
803 }
804
805 static int
806 filt_timer(struct knote *kn, long hint)
807 {
808 int rv;
809
810 mutex_enter(&kqueue_misc_lock);
811 rv = (kn->kn_data != 0);
812 mutex_exit(&kqueue_misc_lock);
813
814 return rv;
815 }
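
/*
 * Userland sketch (illustrative, not part of the original source): a
 * periodic 500 ms timer.  As described above, ident is an arbitrary timer
 * identifier, data is the period in milliseconds (values below the clock
 * resolution are rounded up to one tick), EV_CLEAR is set automatically,
 * and the returned data field counts expirations since the last retrieval.
 */
#if 0
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent: wait");
		printf("timer fired %lld time(s)\n", (long long)kev.data);
	}
}
#endif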
816
817 static int
818 filt_userattach(struct knote *kn)
819 {
820 struct kqueue *kq = kn->kn_kq;
821
822 /*
823 * EVFILT_USER knotes are not attached to anything in the kernel.
824 */
825 mutex_spin_enter(&kq->kq_lock);
826 kn->kn_hook = NULL;
827 if (kn->kn_fflags & NOTE_TRIGGER)
828 kn->kn_hookid = 1;
829 else
830 kn->kn_hookid = 0;
831 mutex_spin_exit(&kq->kq_lock);
832 return (0);
833 }
834
835 static void
836 filt_userdetach(struct knote *kn)
837 {
838
839 /*
840 * EVFILT_USER knotes are not attached to anything in the kernel.
841 */
842 }
843
844 static int
845 filt_user(struct knote *kn, long hint)
846 {
847 struct kqueue *kq = kn->kn_kq;
848 int hookid;
849
850 mutex_spin_enter(&kq->kq_lock);
851 hookid = kn->kn_hookid;
852 mutex_spin_exit(&kq->kq_lock);
853
854 return hookid;
855 }
856
857 static void
858 filt_usertouch(struct knote *kn, struct kevent *kev, long type)
859 {
860 int ffctrl;
861
862 KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
863
864 switch (type) {
865 case EVENT_REGISTER:
866 if (kev->fflags & NOTE_TRIGGER)
867 kn->kn_hookid = 1;
868
869 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
870 kev->fflags &= NOTE_FFLAGSMASK;
871 switch (ffctrl) {
872 case NOTE_FFNOP:
873 break;
874
875 case NOTE_FFAND:
876 kn->kn_sfflags &= kev->fflags;
877 break;
878
879 case NOTE_FFOR:
880 kn->kn_sfflags |= kev->fflags;
881 break;
882
883 case NOTE_FFCOPY:
884 kn->kn_sfflags = kev->fflags;
885 break;
886
887 default:
888 /* XXX Return error? */
889 break;
890 }
891 kn->kn_sdata = kev->data;
892 if (kev->flags & EV_CLEAR) {
893 kn->kn_hookid = 0;
894 kn->kn_data = 0;
895 kn->kn_fflags = 0;
896 }
897 break;
898
899 case EVENT_PROCESS:
900 *kev = kn->kn_kevent;
901 kev->fflags = kn->kn_sfflags;
902 kev->data = kn->kn_sdata;
903 if (kn->kn_flags & EV_CLEAR) {
904 kn->kn_hookid = 0;
905 kn->kn_data = 0;
906 kn->kn_fflags = 0;
907 }
908 break;
909
910 default:
911 panic("filt_usertouch() - invalid type (%ld)", type);
912 break;
913 }
914 }
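
/*
 * Userland sketch (illustrative, not part of the original source): an
 * EVFILT_USER event is armed with EV_ADD and later fired by a second
 * kevent() call carrying NOTE_TRIGGER, typically from another thread.
 * The NOTE_FF* control bits handled by filt_usertouch() above let the
 * triggering call merge application-defined bits into the delivered
 * fflags.
 */
#if 0
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Register user event 1; EV_CLEAR rearms it after each delivery. */
	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");

	/* Trigger it, OR-ing an application-defined flag into fflags. */
	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFOR | 0x1, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: trigger");

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent: wait");
	printf("user event fired, fflags %#x\n", kev.fflags);
	return 0;
}
#endif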
915
916 /*
917 * filt_seltrue:
918 *
919 * This filter "event" routine simulates seltrue().
920 */
921 int
922 filt_seltrue(struct knote *kn, long hint)
923 {
924
925 /*
926 * We don't know how much data can be read/written,
927 * but we know that it *can* be. This is about as
928 * well as select/poll does.
929 */
930 kn->kn_data = 0;
931 return (1);
932 }
933
934 /*
935 * This provides a full kqfilter entry for device switch tables, which
936 * has the same effect as a filter using filt_seltrue() as its event method.
937 */
938 static void
939 filt_seltruedetach(struct knote *kn)
940 {
941 /* Nothing to do */
942 }
943
944 const struct filterops seltrue_filtops = {
945 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
946 .f_attach = NULL,
947 .f_detach = filt_seltruedetach,
948 .f_event = filt_seltrue,
949 };
950
951 int
952 seltrue_kqfilter(dev_t dev, struct knote *kn)
953 {
954 switch (kn->kn_filter) {
955 case EVFILT_READ:
956 case EVFILT_WRITE:
957 kn->kn_fop = &seltrue_filtops;
958 break;
959 default:
960 return (EINVAL);
961 }
962
963 /* Nothing more to do */
964 return (0);
965 }
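
/*
 * Example (illustrative sketch, not part of the original source): a
 * hypothetical character device that is always ready for I/O can point
 * its cdevsw kqfilter entry at seltrue_kqfilter(), or wrap it as below.
 * The "exampledev" name is an assumption for illustration only.
 */
#if 0
static int
exampledev_kqfilter(dev_t dev, struct knote *kn)
{

	/* EVFILT_READ and EVFILT_WRITE are reported as always ready. */
	return seltrue_kqfilter(dev, kn);
}
#endif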
966
967 /*
968 * Common code for the kqueue(2) and kqueue1(2) system calls.
969 */
970 static int
971 kqueue1(struct lwp *l, int flags, register_t *retval)
972 {
973 struct kqueue *kq;
974 file_t *fp;
975 int fd, error;
976
977 if ((error = fd_allocfile(&fp, &fd)) != 0)
978 return error;
979 fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
980 fp->f_type = DTYPE_KQUEUE;
981 fp->f_ops = &kqueueops;
982 kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
983 mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
984 cv_init(&kq->kq_cv, "kqueue");
985 selinit(&kq->kq_sel);
986 TAILQ_INIT(&kq->kq_head);
987 fp->f_kqueue = kq;
988 *retval = fd;
989 kq->kq_fdp = curlwp->l_fd;
990 fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
991 fd_affix(curproc, fp, fd);
992 return error;
993 }
994
995 /*
996 * kqueue(2) system call.
997 */
998 int
999 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
1000 {
1001 return kqueue1(l, 0, retval);
1002 }
1003
1004 int
1005 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
1006 register_t *retval)
1007 {
1008 /* {
1009 syscallarg(int) flags;
1010 } */
1011 return kqueue1(l, SCARG(uap, flags), retval);
1012 }
1013
1014 /*
1015 * kevent(2) system call.
1016 */
1017 int
1018 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
1019 struct kevent *changes, size_t index, int n)
1020 {
1021
1022 return copyin(changelist + index, changes, n * sizeof(*changes));
1023 }
1024
1025 int
1026 kevent_put_events(void *ctx, struct kevent *events,
1027 struct kevent *eventlist, size_t index, int n)
1028 {
1029
1030 return copyout(events, eventlist + index, n * sizeof(*events));
1031 }
1032
1033 static const struct kevent_ops kevent_native_ops = {
1034 .keo_private = NULL,
1035 .keo_fetch_timeout = copyin,
1036 .keo_fetch_changes = kevent_fetch_changes,
1037 .keo_put_events = kevent_put_events,
1038 };
1039
1040 int
1041 sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
1042 register_t *retval)
1043 {
1044 /* {
1045 syscallarg(int) fd;
1046 syscallarg(const struct kevent *) changelist;
1047 syscallarg(size_t) nchanges;
1048 syscallarg(struct kevent *) eventlist;
1049 syscallarg(size_t) nevents;
1050 syscallarg(const struct timespec *) timeout;
1051 } */
1052
1053 return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
1054 SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
1055 SCARG(uap, timeout), &kevent_native_ops);
1056 }
1057
1058 int
1059 kevent1(register_t *retval, int fd,
1060 const struct kevent *changelist, size_t nchanges,
1061 struct kevent *eventlist, size_t nevents,
1062 const struct timespec *timeout,
1063 const struct kevent_ops *keops)
1064 {
1065 struct kevent *kevp;
1066 struct kqueue *kq;
1067 struct timespec ts;
1068 size_t i, n, ichange;
1069 int nerrors, error;
1070 struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
1071 file_t *fp;
1072
1073 /* check that we're dealing with a kq */
1074 fp = fd_getfile(fd);
1075 if (fp == NULL)
1076 return (EBADF);
1077
1078 if (fp->f_type != DTYPE_KQUEUE) {
1079 fd_putfile(fd);
1080 return (EBADF);
1081 }
1082
1083 if (timeout != NULL) {
1084 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
1085 if (error)
1086 goto done;
1087 timeout = &ts;
1088 }
1089
1090 kq = fp->f_kqueue;
1091 nerrors = 0;
1092 ichange = 0;
1093
1094 /* traverse list of events to register */
1095 while (nchanges > 0) {
1096 n = MIN(nchanges, __arraycount(kevbuf));
1097 error = (*keops->keo_fetch_changes)(keops->keo_private,
1098 changelist, kevbuf, ichange, n);
1099 if (error)
1100 goto done;
1101 for (i = 0; i < n; i++) {
1102 kevp = &kevbuf[i];
1103 kevp->flags &= ~EV_SYSFLAGS;
1104 /* register each knote */
1105 error = kqueue_register(kq, kevp);
1106 if (!error && !(kevp->flags & EV_RECEIPT))
1107 continue;
1108 if (nevents == 0)
1109 goto done;
1110 kevp->flags = EV_ERROR;
1111 kevp->data = error;
1112 error = (*keops->keo_put_events)
1113 (keops->keo_private, kevp,
1114 eventlist, nerrors, 1);
1115 if (error)
1116 goto done;
1117 nevents--;
1118 nerrors++;
1119 }
1120 nchanges -= n; /* update the results */
1121 ichange += n;
1122 }
1123 if (nerrors) {
1124 *retval = nerrors;
1125 error = 0;
1126 goto done;
1127 }
1128
1129 /* actually scan through the events */
1130 error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
1131 kevbuf, __arraycount(kevbuf));
1132 done:
1133 fd_putfile(fd);
1134 return (error);
1135 }
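
/*
 * Userland sketch (illustrative, not part of the original source): the
 * common kevent(2) pattern implemented by this syscall -- submit changes
 * and collect pending events in a single call, here waiting up to five
 * seconds for standard input to become readable.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent change, event;
	struct timespec ts = { 5, 0 };
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, 0);
	n = kevent(kq, &change, 1, &event, 1, &ts);
	if (n == -1)
		err(1, "kevent");
	if (n == 0)
		printf("timed out\n");
	else
		printf("stdin readable, %lld byte(s) pending\n",
		    (long long)event.data);
	return 0;
}
#endif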
1136
1137 /*
1138 * Register a given kevent kev onto the kqueue
1139 */
1140 static int
1141 kqueue_register(struct kqueue *kq, struct kevent *kev)
1142 {
1143 struct kfilter *kfilter;
1144 filedesc_t *fdp;
1145 file_t *fp;
1146 fdfile_t *ff;
1147 struct knote *kn, *newkn;
1148 struct klist *list;
1149 int error, fd, rv;
1150
1151 fdp = kq->kq_fdp;
1152 fp = NULL;
1153 kn = NULL;
1154 error = 0;
1155 fd = 0;
1156
1157 newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
1158
1159 rw_enter(&kqueue_filter_lock, RW_READER);
1160 kfilter = kfilter_byfilter(kev->filter);
1161 if (kfilter == NULL || kfilter->filtops == NULL) {
1162 /* filter not found or not implemented */
1163 rw_exit(&kqueue_filter_lock);
1164 kmem_free(newkn, sizeof(*newkn));
1165 return (EINVAL);
1166 }
1167
1168 /* search if knote already exists */
1169 if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
1170 /* monitoring a file descriptor */
1171 /* validate descriptor */
1172 if (kev->ident > INT_MAX
1173 || (fp = fd_getfile(fd = kev->ident)) == NULL) {
1174 rw_exit(&kqueue_filter_lock);
1175 kmem_free(newkn, sizeof(*newkn));
1176 return EBADF;
1177 }
1178 mutex_enter(&fdp->fd_lock);
1179 ff = fdp->fd_dt->dt_ff[fd];
1180 if (ff->ff_refcnt & FR_CLOSING) {
1181 error = EBADF;
1182 goto doneunlock;
1183 }
1184 if (fd <= fdp->fd_lastkqfile) {
1185 SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
1186 if (kq == kn->kn_kq &&
1187 kev->filter == kn->kn_filter)
1188 break;
1189 }
1190 }
1191 } else {
1192 /*
1193 * not monitoring a file descriptor, so
1194 * lookup knotes in internal hash table
1195 */
1196 mutex_enter(&fdp->fd_lock);
1197 if (fdp->fd_knhashmask != 0) {
1198 list = &fdp->fd_knhash[
1199 KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1200 SLIST_FOREACH(kn, list, kn_link) {
1201 if (kev->ident == kn->kn_id &&
1202 kq == kn->kn_kq &&
1203 kev->filter == kn->kn_filter)
1204 break;
1205 }
1206 }
1207 }
1208
1209 /*
1210 * kn now contains the matching knote, or NULL if no match
1211 */
1212 if (kn == NULL) {
1213 if (kev->flags & EV_ADD) {
1214 /* create new knote */
1215 kn = newkn;
1216 newkn = NULL;
1217 kn->kn_obj = fp;
1218 kn->kn_id = kev->ident;
1219 kn->kn_kq = kq;
1220 kn->kn_fop = kfilter->filtops;
1221 kn->kn_kfilter = kfilter;
1222 kn->kn_sfflags = kev->fflags;
1223 kn->kn_sdata = kev->data;
1224 kev->fflags = 0;
1225 kev->data = 0;
1226 kn->kn_kevent = *kev;
1227
1228 KASSERT(kn->kn_fop != NULL);
1229 /*
1230 * apply reference count to knote structure, and
1231 * do not release it at the end of this routine.
1232 */
1233 fp = NULL;
1234
1235 if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
1236 /*
1237 * If knote is not on an fd, store on
1238 * internal hash table.
1239 */
1240 if (fdp->fd_knhashmask == 0) {
1241 /* XXXAD can block with fd_lock held */
1242 fdp->fd_knhash = hashinit(KN_HASHSIZE,
1243 HASH_LIST, true,
1244 &fdp->fd_knhashmask);
1245 }
1246 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
1247 fdp->fd_knhashmask)];
1248 } else {
1249 /* Otherwise, knote is on an fd. */
1250 list = (struct klist *)
1251 &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1252 if ((int)kn->kn_id > fdp->fd_lastkqfile)
1253 fdp->fd_lastkqfile = kn->kn_id;
1254 }
1255 SLIST_INSERT_HEAD(list, kn, kn_link);
1256
1257 /*
1258 * N.B. kn->kn_fop may change as the result
1259 * of filter_attach()!
1260 */
1261 error = filter_attach(kn);
1262 if (error != 0) {
1263 #ifdef DEBUG
1264 struct proc *p = curlwp->l_proc;
1265 const file_t *ft = kn->kn_obj;
1266 printf("%s: %s[%d]: event type %d not "
1267 "supported for file type %d/%s "
1268 "(error %d)\n", __func__,
1269 p->p_comm, p->p_pid,
1270 kn->kn_filter, ft ? ft->f_type : -1,
1271 ft ? ft->f_ops->fo_name : "?", error);
1272 #endif
1273
1274 /* knote_detach() drops fdp->fd_lock */
1275 knote_detach(kn, fdp, false);
1276 goto done;
1277 }
1278 atomic_inc_uint(&kfilter->refcnt);
1279 goto done_ev_add;
1280 } else {
1281 /* No matching knote and the EV_ADD flag is not set. */
1282 error = ENOENT;
1283 goto doneunlock;
1284 }
1285 }
1286
1287 if (kev->flags & EV_DELETE) {
1288 /* knote_detach() drops fdp->fd_lock */
1289 knote_detach(kn, fdp, true);
1290 goto done;
1291 }
1292
1293 /*
1294 * The user may change some filter values after the
1295 * initial EV_ADD, but doing so will not reset any
1296 * filters which have already been triggered.
1297 */
1298 kn->kn_kevent.udata = kev->udata;
1299 KASSERT(kn->kn_fop != NULL);
1300 if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1301 kn->kn_fop->f_touch != NULL) {
1302 mutex_spin_enter(&kq->kq_lock);
1303 filter_touch(kn, kev, EVENT_REGISTER);
1304 mutex_spin_exit(&kq->kq_lock);
1305 } else {
1306 kn->kn_sfflags = kev->fflags;
1307 kn->kn_sdata = kev->data;
1308 }
1309
1310 /*
1311 * We can get here if we are trying to attach
1312 * an event to a file descriptor that does not
1313 * support events, and the attach routine is
1314 * broken and does not return an error.
1315 */
1316 done_ev_add:
1317 rv = filter_event(kn, 0);
1318 if (rv)
1319 knote_activate(kn);
1320
1321 /* disable knote */
1322 if ((kev->flags & EV_DISABLE)) {
1323 mutex_spin_enter(&kq->kq_lock);
1324 if ((kn->kn_status & KN_DISABLED) == 0)
1325 kn->kn_status |= KN_DISABLED;
1326 mutex_spin_exit(&kq->kq_lock);
1327 }
1328
1329 /* enable knote */
1330 if ((kev->flags & EV_ENABLE)) {
1331 knote_enqueue(kn);
1332 }
1333 doneunlock:
1334 mutex_exit(&fdp->fd_lock);
1335 done:
1336 rw_exit(&kqueue_filter_lock);
1337 if (newkn != NULL)
1338 kmem_free(newkn, sizeof(*newkn));
1339 if (fp != NULL)
1340 fd_putfile(fd);
1341 return (error);
1342 }
1343
1344 #if defined(DEBUG)
1345 #define KN_FMT(buf, kn) \
1346 (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
1347
1348 static void
1349 kqueue_check(const char *func, size_t line, const struct kqueue *kq)
1350 {
1351 const struct knote *kn;
1352 u_int count;
1353 int nmarker;
1354 char buf[128];
1355
1356 KASSERT(mutex_owned(&kq->kq_lock));
1357 KASSERT(KQ_COUNT(kq) < UINT_MAX / 2);
1358
1359 count = 0;
1360 nmarker = 0;
1361 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1362 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
1363 panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
1364 func, line, kq, kn, KN_FMT(buf, kn));
1365 }
1366 if ((kn->kn_status & KN_MARKER) == 0) {
1367 if (kn->kn_kq != kq) {
1368 panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
1369 func, line, kq, kn, kn->kn_kq,
1370 KN_FMT(buf, kn));
1371 }
1372 if ((kn->kn_status & KN_ACTIVE) == 0) {
1373 panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
1374 func, line, kq, kn, KN_FMT(buf, kn));
1375 }
1376 count++;
1377 if (count > KQ_COUNT(kq)) {
1378 panic("%s,%zu: kq=%p kq->kq_count(%d) != "
1379 "count(%d), nmarker=%d",
1380 func, line, kq, KQ_COUNT(kq), count,
1381 nmarker);
1382 }
1383 } else {
1384 nmarker++;
1385 }
1386 }
1387 }
1388 #define kq_check(a) kqueue_check(__func__, __LINE__, (a))
1389 #else /* defined(DEBUG) */
1390 #define kq_check(a) /* nothing */
1391 #endif /* defined(DEBUG) */
1392
1393 static void
1394 kqueue_restart(file_t *fp)
1395 {
1396 struct kqueue *kq = fp->f_kqueue;
1397 KASSERT(kq != NULL);
1398
1399 mutex_spin_enter(&kq->kq_lock);
1400 kq->kq_count |= KQ_RESTART;
1401 cv_broadcast(&kq->kq_cv);
1402 mutex_spin_exit(&kq->kq_lock);
1403 }
1404
1405 /*
1406 * Scan through the list of events on fp (for a maximum of maxevents),
1407 * returning the results into ulistp. Timeout is determined by tsp; if
1408 * NULL, wait indefinitely; if 0 valued, perform a poll; otherwise wait
1409 * as appropriate.
1410 */
1411 static int
1412 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
1413 const struct timespec *tsp, register_t *retval,
1414 const struct kevent_ops *keops, struct kevent *kevbuf,
1415 size_t kevcnt)
1416 {
1417 struct kqueue *kq;
1418 struct kevent *kevp;
1419 struct timespec ats, sleepts;
1420 struct knote *kn, *marker, morker;
1421 size_t count, nkev, nevents;
1422 int timeout, error, touch, rv, influx;
1423 filedesc_t *fdp;
1424
1425 fdp = curlwp->l_fd;
1426 kq = fp->f_kqueue;
1427 count = maxevents;
1428 nkev = nevents = error = 0;
1429 if (count == 0) {
1430 *retval = 0;
1431 return 0;
1432 }
1433
1434 if (tsp) { /* timeout supplied */
1435 ats = *tsp;
1436 if (inittimeleft(&ats, &sleepts) == -1) {
1437 *retval = maxevents;
1438 return EINVAL;
1439 }
1440 timeout = tstohz(&ats);
1441 if (timeout <= 0)
1442 timeout = -1; /* do poll */
1443 } else {
1444 /* no timeout, wait forever */
1445 timeout = 0;
1446 }
1447
1448 memset(&morker, 0, sizeof(morker));
1449 marker = &morker;
1450 marker->kn_status = KN_MARKER;
1451 mutex_spin_enter(&kq->kq_lock);
1452 retry:
1453 kevp = kevbuf;
1454 if (KQ_COUNT(kq) == 0) {
1455 if (timeout >= 0) {
1456 error = cv_timedwait_sig(&kq->kq_cv,
1457 &kq->kq_lock, timeout);
1458 if (error == 0) {
1459 if (KQ_COUNT(kq) == 0 &&
1460 (kq->kq_count & KQ_RESTART)) {
1461 /* return to clear file reference */
1462 error = ERESTART;
1463 } else if (tsp == NULL || (timeout =
1464 gettimeleft(&ats, &sleepts)) > 0) {
1465 goto retry;
1466 }
1467 } else {
1468 /* don't restart after signals... */
1469 if (error == ERESTART)
1470 error = EINTR;
1471 if (error == EWOULDBLOCK)
1472 error = 0;
1473 }
1474 }
1475 mutex_spin_exit(&kq->kq_lock);
1476 goto done;
1477 }
1478
1479 /* mark end of knote list */
1480 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1481 influx = 0;
1482
1483 /*
1484 * Acquire the fdp->fd_lock interlock to avoid races with
1485 * file creation/destruction from other threads.
1486 */
1487 relock:
1488 mutex_spin_exit(&kq->kq_lock);
1489 mutex_enter(&fdp->fd_lock);
1490 mutex_spin_enter(&kq->kq_lock);
1491
1492 while (count != 0) {
1493 kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */
1494
1495 if ((kn->kn_status & KN_MARKER) != 0 && kn != marker) {
1496 if (influx) {
1497 influx = 0;
1498 KQ_FLUX_WAKEUP(kq);
1499 }
1500 mutex_exit(&fdp->fd_lock);
1501 (void)cv_wait(&kq->kq_cv, &kq->kq_lock);
1502 goto relock;
1503 }
1504
1505 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1506 if (kn == marker) {
1507 /* it's our marker, stop */
1508 KQ_FLUX_WAKEUP(kq);
1509 if (count == maxevents) {
1510 mutex_exit(&fdp->fd_lock);
1511 goto retry;
1512 }
1513 break;
1514 }
1515 KASSERT((kn->kn_status & KN_BUSY) == 0);
1516
1517 kq_check(kq);
1518 kn->kn_status &= ~KN_QUEUED;
1519 kn->kn_status |= KN_BUSY;
1520 kq_check(kq);
1521 if (kn->kn_status & KN_DISABLED) {
1522 kn->kn_status &= ~KN_BUSY;
1523 kq->kq_count--;
1524 /* don't want disabled events */
1525 continue;
1526 }
1527 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1528 mutex_spin_exit(&kq->kq_lock);
1529 KASSERT(mutex_owned(&fdp->fd_lock));
1530 rv = filter_event(kn, 0);
1531 mutex_spin_enter(&kq->kq_lock);
1532 /* Re-poll if note was re-enqueued. */
1533 if ((kn->kn_status & KN_QUEUED) != 0) {
1534 kn->kn_status &= ~KN_BUSY;
1535 /* Re-enqueue raised kq_count, lower it again */
1536 kq->kq_count--;
1537 influx = 1;
1538 continue;
1539 }
1540 if (rv == 0) {
1541 /*
1542 * non-ONESHOT event that hasn't
1543 * triggered again, so de-queue.
1544 */
1545 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1546 kq->kq_count--;
1547 influx = 1;
1548 continue;
1549 }
1550 }
1551 KASSERT(kn->kn_fop != NULL);
1552 touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1553 kn->kn_fop->f_touch != NULL);
1554 /* XXXAD should be got from f_event if !oneshot. */
1555 if (touch) {
1556 filter_touch(kn, kevp, EVENT_PROCESS);
1557 } else {
1558 *kevp = kn->kn_kevent;
1559 }
1560 kevp++;
1561 nkev++;
1562 influx = 1;
1563 if (kn->kn_flags & EV_ONESHOT) {
1564 /* delete ONESHOT events after retrieval */
1565 kn->kn_status &= ~KN_BUSY;
1566 kq->kq_count--;
1567 mutex_spin_exit(&kq->kq_lock);
1568 knote_detach(kn, fdp, true);
1569 mutex_enter(&fdp->fd_lock);
1570 mutex_spin_enter(&kq->kq_lock);
1571 } else if (kn->kn_flags & EV_CLEAR) {
1572 /* clear state after retrieval */
1573 kn->kn_data = 0;
1574 kn->kn_fflags = 0;
1575 /*
1576 * Manually clear knotes that weren't
1577 * 'touch'ed.
1578 */
1579 if (touch == 0) {
1580 kn->kn_data = 0;
1581 kn->kn_fflags = 0;
1582 }
1583 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1584 kq->kq_count--;
1585 } else if (kn->kn_flags & EV_DISPATCH) {
1586 kn->kn_status |= KN_DISABLED;
1587 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1588 kq->kq_count--;
1589 } else {
1590 /* add event back on list */
1591 kq_check(kq);
1592 kn->kn_status |= KN_QUEUED;
1593 kn->kn_status &= ~KN_BUSY;
1594 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1595 kq_check(kq);
1596 }
1597
1598 if (nkev == kevcnt) {
1599 /* do copyouts in kevcnt chunks */
1600 influx = 0;
1601 KQ_FLUX_WAKEUP(kq);
1602 mutex_spin_exit(&kq->kq_lock);
1603 mutex_exit(&fdp->fd_lock);
1604 error = (*keops->keo_put_events)
1605 (keops->keo_private,
1606 kevbuf, ulistp, nevents, nkev);
1607 mutex_enter(&fdp->fd_lock);
1608 mutex_spin_enter(&kq->kq_lock);
1609 nevents += nkev;
1610 nkev = 0;
1611 kevp = kevbuf;
1612 }
1613 count--;
1614 if (error != 0 || count == 0) {
1615 /* remove marker */
1616 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1617 break;
1618 }
1619 }
1620 KQ_FLUX_WAKEUP(kq);
1621 mutex_spin_exit(&kq->kq_lock);
1622 mutex_exit(&fdp->fd_lock);
1623
1624 done:
1625 if (nkev != 0) {
1626 /* copyout remaining events */
1627 error = (*keops->keo_put_events)(keops->keo_private,
1628 kevbuf, ulistp, nevents, nkev);
1629 }
1630 *retval = maxevents - count;
1631
1632 return error;
1633 }
1634
1635 /*
1636 * fileops ioctl method for a kqueue descriptor.
1637 *
1638 * Two ioctls are currently supported. They both use struct kfilter_mapping:
1639 * KFILTER_BYFILTER find name for filter, and return result in
1640 * name, which is of size len.
1641 * KFILTER_BYNAME find filter for name. len is ignored.
1642 */
1643 /*ARGSUSED*/
1644 static int
1645 kqueue_ioctl(file_t *fp, u_long com, void *data)
1646 {
1647 struct kfilter_mapping *km;
1648 const struct kfilter *kfilter;
1649 char *name;
1650 int error;
1651
1652 km = data;
1653 error = 0;
1654 name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1655
1656 switch (com) {
1657 case KFILTER_BYFILTER: /* convert filter -> name */
1658 rw_enter(&kqueue_filter_lock, RW_READER);
1659 kfilter = kfilter_byfilter(km->filter);
1660 if (kfilter != NULL) {
1661 strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1662 rw_exit(&kqueue_filter_lock);
1663 error = copyoutstr(name, km->name, km->len, NULL);
1664 } else {
1665 rw_exit(&kqueue_filter_lock);
1666 error = ENOENT;
1667 }
1668 break;
1669
1670 case KFILTER_BYNAME: /* convert name -> filter */
1671 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1672 if (error) {
1673 break;
1674 }
1675 rw_enter(&kqueue_filter_lock, RW_READER);
1676 kfilter = kfilter_byname(name);
1677 if (kfilter != NULL)
1678 km->filter = kfilter->filter;
1679 else
1680 error = ENOENT;
1681 rw_exit(&kqueue_filter_lock);
1682 break;
1683
1684 default:
1685 error = ENOTTY;
1686 break;
1687
1688 }
1689 kmem_free(name, KFILTER_MAXNAME);
1690 return (error);
1691 }
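
/*
 * Userland sketch (illustrative, not part of the original source): resolve
 * a filter name to its numeric id with the KFILTER_BYNAME ioctl described
 * above.  "EVFILT_EXAMPLE" is a hypothetical name; in practice this is how
 * a program finds the id of a filter registered with kfilter_register().
 */
#if 0
#include <sys/event.h>
#include <sys/ioctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kfilter_mapping km;
	char name[] = "EVFILT_EXAMPLE";
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	km.name = name;
	km.len = sizeof(name);
	if (ioctl(kq, KFILTER_BYNAME, &km) == -1)
		err(1, "KFILTER_BYNAME");
	printf("filter id %u\n", km.filter);
	return 0;
}
#endif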
1692
1693 /*
1694 * fileops fcntl method for a kqueue descriptor.
1695 */
1696 static int
1697 kqueue_fcntl(file_t *fp, u_int com, void *data)
1698 {
1699
1700 return (ENOTTY);
1701 }
1702
1703 /*
1704 * fileops poll method for a kqueue descriptor.
1705 * Determine if kqueue has events pending.
1706 */
1707 static int
1708 kqueue_poll(file_t *fp, int events)
1709 {
1710 struct kqueue *kq;
1711 int revents;
1712
1713 kq = fp->f_kqueue;
1714
1715 revents = 0;
1716 if (events & (POLLIN | POLLRDNORM)) {
1717 mutex_spin_enter(&kq->kq_lock);
1718 if (KQ_COUNT(kq) != 0) {
1719 revents |= events & (POLLIN | POLLRDNORM);
1720 } else {
1721 selrecord(curlwp, &kq->kq_sel);
1722 }
1723 kq_check(kq);
1724 mutex_spin_exit(&kq->kq_lock);
1725 }
1726
1727 return revents;
1728 }
1729
1730 /*
1731 * fileops stat method for a kqueue descriptor.
1732 * Returns dummy info, with st_size being number of events pending.
1733 */
1734 static int
1735 kqueue_stat(file_t *fp, struct stat *st)
1736 {
1737 struct kqueue *kq;
1738
1739 kq = fp->f_kqueue;
1740
1741 memset(st, 0, sizeof(*st));
1742 st->st_size = KQ_COUNT(kq);
1743 st->st_blksize = sizeof(struct kevent);
1744 st->st_mode = S_IFIFO;
1745
1746 return 0;
1747 }
1748
1749 static void
1750 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
1751 {
1752 struct knote *kn;
1753 filedesc_t *fdp;
1754
1755 fdp = kq->kq_fdp;
1756
1757 KASSERT(mutex_owned(&fdp->fd_lock));
1758
1759 for (kn = SLIST_FIRST(list); kn != NULL;) {
1760 if (kq != kn->kn_kq) {
1761 kn = SLIST_NEXT(kn, kn_link);
1762 continue;
1763 }
1764 knote_detach(kn, fdp, true);
1765 mutex_enter(&fdp->fd_lock);
1766 kn = SLIST_FIRST(list);
1767 }
1768 }
1769
1770
1771 /*
1772 * fileops close method for a kqueue descriptor.
1773 */
1774 static int
1775 kqueue_close(file_t *fp)
1776 {
1777 struct kqueue *kq;
1778 filedesc_t *fdp;
1779 fdfile_t *ff;
1780 int i;
1781
1782 kq = fp->f_kqueue;
1783 fp->f_kqueue = NULL;
1784 fp->f_type = 0;
1785 fdp = curlwp->l_fd;
1786
1787 mutex_enter(&fdp->fd_lock);
1788 for (i = 0; i <= fdp->fd_lastkqfile; i++) {
1789 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1790 continue;
1791 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1792 }
1793 if (fdp->fd_knhashmask != 0) {
1794 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
1795 kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
1796 }
1797 }
1798 mutex_exit(&fdp->fd_lock);
1799
1800 KASSERT(KQ_COUNT(kq) == 0);
1801 mutex_destroy(&kq->kq_lock);
1802 cv_destroy(&kq->kq_cv);
1803 seldestroy(&kq->kq_sel);
1804 kmem_free(kq, sizeof(*kq));
1805
1806 return (0);
1807 }
1808
1809 /*
1810 * struct fileops kqfilter method for a kqueue descriptor.
1811 * Event triggered when monitored kqueue changes.
1812 */
1813 static int
1814 kqueue_kqfilter(file_t *fp, struct knote *kn)
1815 {
1816 struct kqueue *kq;
1817
1818 kq = ((file_t *)kn->kn_obj)->f_kqueue;
1819
1820 KASSERT(fp == kn->kn_obj);
1821
1822 if (kn->kn_filter != EVFILT_READ)
1823 return 1;
1824
1825 kn->kn_fop = &kqread_filtops;
1826 mutex_enter(&kq->kq_lock);
1827 selrecord_knote(&kq->kq_sel, kn);
1828 mutex_exit(&kq->kq_lock);
1829
1830 return 0;
1831 }
1832
1833
1834 /*
1835 * Walk down a list of knotes, activating them if their event has
1836 * triggered. The caller's object lock (e.g. device driver lock)
1837 * must be held.
1838 */
1839 void
1840 knote(struct klist *list, long hint)
1841 {
1842 struct knote *kn, *tmpkn;
1843
1844 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1845 KASSERT(kn->kn_fop != NULL);
1846 KASSERT(kn->kn_fop->f_event != NULL);
1847 if ((*kn->kn_fop->f_event)(kn, hint))
1848 knote_activate(kn);
1849 }
1850 }
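
/*
 * Example (illustrative sketch, not part of the original source): a
 * hypothetical driver delivering a "readable" event while holding its
 * object lock.  selnotify() wakes select/poll waiters and invokes knote()
 * on the knotes attached to the selinfo; passing NOTE_SUBMIT as the hint
 * tells a filter written to the locking rules near the top of this file
 * that the object lock is already held.  All "exampledev" names are
 * assumptions for illustration only.
 */
#if 0
static void
exampledev_rxintr(struct exampledev_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	sc->sc_rxready = true;
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
	mutex_exit(&sc->sc_lock);
}
#endif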
1851
1852 /*
1853 * Remove all knotes referencing a specified fd
1854 */
1855 void
1856 knote_fdclose(int fd)
1857 {
1858 struct klist *list;
1859 struct knote *kn;
1860 filedesc_t *fdp;
1861
1862 fdp = curlwp->l_fd;
1863 mutex_enter(&fdp->fd_lock);
1864 list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1865 while ((kn = SLIST_FIRST(list)) != NULL) {
1866 knote_detach(kn, fdp, true);
1867 mutex_enter(&fdp->fd_lock);
1868 }
1869 mutex_exit(&fdp->fd_lock);
1870 }
1871
1872 /*
1873 * Drop knote. Called with fdp->fd_lock held, and will drop it
1874 * before returning.
1875 */
1876 static void
1877 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
1878 {
1879 struct klist *list;
1880 struct kqueue *kq;
1881
1882 kq = kn->kn_kq;
1883
1884 KASSERT((kn->kn_status & KN_MARKER) == 0);
1885 KASSERT(mutex_owned(&fdp->fd_lock));
1886
1887 KASSERT(kn->kn_fop != NULL);
1888 /* Remove from monitored object. */
1889 if (dofop) {
1890 filter_detach(kn);
1891 }
1892
1893 /* Remove from descriptor table. */
1894 if (kn->kn_fop->f_flags & FILTEROP_ISFD)
1895 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1896 else
1897 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1898
1899 SLIST_REMOVE(list, kn, knote, kn_link);
1900
1901 /* Remove from kqueue. */
1902 again:
1903 mutex_spin_enter(&kq->kq_lock);
1904 if ((kn->kn_status & KN_QUEUED) != 0) {
1905 kq_check(kq);
1906 kq->kq_count--;
1907 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1908 kn->kn_status &= ~KN_QUEUED;
1909 kq_check(kq);
1910 } else if (kn->kn_status & KN_BUSY) {
1911 mutex_spin_exit(&kq->kq_lock);
1912 goto again;
1913 }
1914 mutex_spin_exit(&kq->kq_lock);
1915
1916 mutex_exit(&fdp->fd_lock);
1917 if (kn->kn_fop->f_flags & FILTEROP_ISFD)
1918 fd_putfile(kn->kn_id);
1919 atomic_dec_uint(&kn->kn_kfilter->refcnt);
1920 kmem_free(kn, sizeof(*kn));
1921 }
1922
1923 /*
1924 * Queue new event for knote.
1925 */
1926 static void
1927 knote_enqueue(struct knote *kn)
1928 {
1929 struct kqueue *kq;
1930
1931 KASSERT((kn->kn_status & KN_MARKER) == 0);
1932
1933 kq = kn->kn_kq;
1934
1935 mutex_spin_enter(&kq->kq_lock);
1936 if ((kn->kn_status & KN_DISABLED) != 0) {
1937 kn->kn_status &= ~KN_DISABLED;
1938 }
1939 if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
1940 kq_check(kq);
1941 kn->kn_status |= KN_QUEUED;
1942 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1943 kq->kq_count++;
1944 kq_check(kq);
1945 cv_broadcast(&kq->kq_cv);
1946 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1947 }
1948 mutex_spin_exit(&kq->kq_lock);
1949 }
1950 /*
1951 * Activate a knote, queueing it for delivery unless it is disabled or already queued.
1952 */
1953 static void
1954 knote_activate(struct knote *kn)
1955 {
1956 struct kqueue *kq;
1957
1958 KASSERT((kn->kn_status & KN_MARKER) == 0);
1959
1960 kq = kn->kn_kq;
1961
1962 mutex_spin_enter(&kq->kq_lock);
1963 kn->kn_status |= KN_ACTIVE;
1964 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
1965 kq_check(kq);
1966 kn->kn_status |= KN_QUEUED;
1967 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1968 kq->kq_count++;
1969 kq_check(kq);
1970 cv_broadcast(&kq->kq_cv);
1971 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1972 }
1973 mutex_spin_exit(&kq->kq_lock);
1974 }
1975