/*	$NetBSD: kern_event.c,v 1.106 2020/02/01 02:23:04 riastradh Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.106 2020/02/01 02:23:04 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>

static int	kqueue_scan(file_t *, size_t, struct kevent *,
		    const struct timespec *, register_t *,
		    const struct kevent_ops *, struct kevent *,
		    size_t);
static int	kqueue_ioctl(file_t *, u_long, void *);
static int	kqueue_fcntl(file_t *, u_int, void *);
static int	kqueue_poll(file_t *, int);
static int	kqueue_kqfilter(file_t *, struct knote *);
static int	kqueue_stat(file_t *, struct stat *);
static int	kqueue_close(file_t *);
static int	kqueue_register(struct kqueue *, struct kevent *);
static void	kqueue_doclose(struct kqueue *, struct klist *, int);

static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
static void	knote_enqueue(struct knote *);
static void	knote_activate(struct knote *);

static void	filt_kqdetach(struct knote *);
static int	filt_kqueue(struct knote *, long hint);
static int	filt_procattach(struct knote *);
static void	filt_procdetach(struct knote *);
static int	filt_proc(struct knote *, long hint);
static int	filt_fileattach(struct knote *);
static void	filt_timerexpire(void *x);
static int	filt_timerattach(struct knote *);
static void	filt_timerdetach(struct knote *);
static int	filt_timer(struct knote *, long hint);
static int	filt_fsattach(struct knote *kn);
static void	filt_fsdetach(struct knote *kn);
static int	filt_fs(struct knote *kn, long hint);

static const struct fileops kqueueops = {
	.fo_name = "kqueue",
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = fnullop_restart,
};

static const struct filterops kqread_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};

static const struct filterops proc_filtops = {
	.f_isfd = 0,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};

static const struct filterops file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
	.f_detach = NULL,
	.f_event = NULL,
};

static const struct filterops timer_filtops = {
	.f_isfd = 0,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
};

static const struct filterops fs_filtops = {
	.f_isfd = 0,
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fs,
};

static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);

#define	KN_HASHSIZE		64	/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;

/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char *name;		/* name of filter */
	uint32_t filter;		/* id of filter */
	unsigned refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t namelen;			/* length of name string */
};

/* System defined filters */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0, },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ "EVFILT_FS",		EVFILT_FS,	0, &fs_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },
};

/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */
static size_t		user_kfiltersz;		/* size of allocated memory */

/*
 * Global Locks.
 *
 * Lock order:
 *
 *	kqueue_filter_lock
 *	-> kn_kq->kq_fdp->fd_lock
 *	-> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
 *	-> kn_kq->kq_lock
 *
 * Locking rules:
 *
 *	f_attach: fdp->fd_lock, KERNEL_LOCK
 *	f_detach: fdp->fd_lock, KERNEL_LOCK
 *	f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
 *	f_event via knote: whatever caller guarantees
 *	Typically,	f_event(NOTE_SUBMIT) via knote: object lock
 *			f_event(!NOTE_SUBMIT) via knote: nothing,
 *				acquires/releases object lock inside.
 */
static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
static kmutex_t		kqueue_misc_lock;	/* miscellaneous */

static kauth_listener_t	kqueue_listener;

static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	if (action != KAUTH_PROCESS_KEVENT_FILTER)
		return result;

	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
	    ISSET(p->p_flag, PK_SUGID)))
		return result;

	result = KAUTH_RESULT_ALLOW;

	return result;
}

/*
 * Initialize the kqueue subsystem.
 */
void
kqueue_init(void)
{

	rw_init(&kqueue_filter_lock);
	mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);

	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    kqueue_listener_cb, NULL);
}

/*
 * Find kfilter entry by name, or NULL if not found.
 */
static struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return &sys_kfilters[i];
	}
	return NULL;
}

static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	/* user filter slots have a NULL name if previously deregistered */
	for (i = 0; i < user_kfilterc ; i++) {
		if (user_kfilters[i].name != NULL &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return &user_kfilters[i];
	}
	return NULL;
}

static struct kfilter *
kfilter_byname(const char *name)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return kfilter;

	return kfilter_byname_user(name);
}

/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}

/*
 * Register a new kfilter. Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	size_t len;
	int i;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EEXIST);	/* already exists */
	}
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* too many */
	}

	for (i = 0; i < user_kfilterc; i++) {
		kfilter = &user_kfilters[i];
		if (kfilter->name == NULL) {
			/* Previously deregistered slot. Reuse. */
			goto reuse;
		}
	}

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/* Grow in KFILTER_EXTENT chunks. */
		user_kfiltermaxc += KFILTER_EXTENT;
		len = user_kfiltermaxc * sizeof(*kfilter);
		kfilter = kmem_alloc(len, KM_SLEEP);
		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
		if (user_kfilters != NULL) {
			memcpy(kfilter, user_kfilters, user_kfiltersz);
			kmem_free(user_kfilters, user_kfiltersz);
		}
		user_kfiltersz = len;
		user_kfilters = kfilter;
	}
	/* Adding new slot */
	kfilter = &user_kfilters[user_kfilterc++];
 reuse:
	kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);

	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;

	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));

	if (retfilter != NULL)
		*retfilter = kfilter->filter;
	rw_exit(&kqueue_filter_lock);

	return (0);
}

/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname_sys(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* can't detach system filters */
	}

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL) {
		rw_exit(&kqueue_filter_lock);
		return (ENOENT);
	}
	if (kfilter->refcnt != 0) {
		rw_exit(&kqueue_filter_lock);
		return (EBUSY);
	}

	/* Cast away const (but we know it's safe). */
	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
	kfilter->name = NULL;	/* mark as `not implemented' */

	if (kfilter->filtops != NULL) {
		/* Cast away const (but we know it's safe). */
		kmem_free(__UNCONST(kfilter->filtops),
		    sizeof(*kfilter->filtops));
		kfilter->filtops = NULL;	/* mark as `not implemented' */
	}
	rw_exit(&kqueue_filter_lock);

	return (0);
}
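
/*
 * Illustrative sketch only (not part of this file): how a hypothetical
 * kernel component might use kfilter_register()/kfilter_unregister()
 * above to provide its own filter. The example_* names are made up;
 * only the kfilter_* calls and the struct filterops layout come from
 * this file.
 *
 *	static int
 *	example_filt_attach(struct knote *kn)
 *	{
 *		kn->kn_flags |= EV_CLEAR;
 *		return 0;
 *	}
 *
 *	static void
 *	example_filt_detach(struct knote *kn)
 *	{
 *	}
 *
 *	static int
 *	example_filt_event(struct knote *kn, long hint)
 *	{
 *		kn->kn_data = 1;
 *		return 1;
 *	}
 *
 *	static const struct filterops example_filtops = {
 *		.f_isfd = 0,
 *		.f_attach = example_filt_attach,
 *		.f_detach = example_filt_detach,
 *		.f_event = example_filt_event,
 *	};
 *
 *	int filter, error;
 *
 *	error = kfilter_register("EVFILT_EXAMPLE", &example_filtops, &filter);
 *	...
 *	error = kfilter_unregister("EVFILT_EXAMPLE");
 */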


/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors. Calls fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	file_t *fp;

	fp = kn->kn_obj;

	return (*fp->f_ops->fo_kqfilter)(fp, kn);
}

/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	mutex_spin_enter(&kq->kq_lock);
	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;
	int rv;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	if (hint != NOTE_SUBMIT)
		mutex_spin_enter(&kq->kq_lock);
	kn->kn_data = kq->kq_count;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		mutex_spin_exit(&kq->kq_lock);

	return rv;
}

/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	struct lwp *curl;

	curl = curlwp;

	mutex_enter(proc_lock);
	if (kn->kn_flags & EV_FLAG1) {
		/*
		 * NOTE_TRACK attaches to the child process too early
		 * for proc_find, so do a raw look up and check the state
		 * explicitly.
		 */
		p = proc_find_raw(kn->kn_id);
		if (p != NULL && p->p_stat != SIDL)
			p = NULL;
	} else {
		p = proc_find(kn->kn_id);
	}

	if (p == NULL) {
		mutex_exit(proc_lock);
		return ESRCH;
	}

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	mutex_enter(p->p_lock);
	mutex_exit(proc_lock);
	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
	    p, NULL, NULL, NULL) != 0) {
		mutex_exit(p->p_lock);
		return EACCES;
	}

	kn->kn_obj = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
	mutex_exit(p->p_lock);

	return 0;
}

/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out. However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_obj;

	mutex_enter(p->p_lock);
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
	mutex_exit(p->p_lock);
}

/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	u_int event, fflag;
	struct kevent kev;
	struct kqueue *kq;
	int error;

	event = (u_int)hint & NOTE_PCTRLMASK;
	kq = kn->kn_kq;
	fflag = 0;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		fflag |= event;

	if (event == NOTE_EXIT) {
		struct proc *p = kn->kn_obj;

		if (p != NULL)
			kn->kn_data = P_WAITSTATUS(p);
		/*
		 * Process is gone, so flag the event as finished.
		 *
		 * Detach the knote from the watched process and mark
		 * it as such. We can't leave this to kqueue_scan(),
		 * since the process might not exist by then. And we
		 * have to do this now, since psignal's KNOTE() is also
		 * called for zombies, and we might end up reading freed
		 * memory if the kevent had already been picked up and
		 * the knote g/c'ed.
		 */
		filt_procdetach(kn);

		mutex_spin_enter(&kq->kq_lock);
		kn->kn_status |= KN_DETACHED;
		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_fflags |= fflag;
		mutex_spin_exit(&kq->kq_lock);

		return 1;
	}

	mutex_spin_enter(&kq->kq_lock);
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		/*
		 * Process forked, and user wants to track the new process,
		 * so attach a new knote to it, and immediately report an
		 * event with the parent's pid. Register knote with new
		 * process.
		 */
		memset(&kev, 0, sizeof(kev));
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		mutex_spin_exit(&kq->kq_lock);
		error = kqueue_register(kq, &kev);
		mutex_spin_enter(&kq->kq_lock);
		if (error != 0)
			kn->kn_fflags |= NOTE_TRACKERR;
	}
	kn->kn_fflags |= fflag;
	fflag = kn->kn_fflags;
	mutex_spin_exit(&kq->kq_lock);

	return fflag != 0;
}
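
/*
 * For reference, a userland sketch (example application code, not part
 * of the kernel) of how the EVFILT_PROC paths above are exercised:
 * watch a process for exit and automatically track processes it forks.
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, 0);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(EXIT_FAILURE, "kevent");
 *
 * A NOTE_CHILD event later reported for a tracked child carries the
 * parent's pid in the data field, as arranged by the EV_FLAG1 handling
 * above.
 */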

static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	int tticks;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_data++;
	knote_activate(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0) {
		tticks = mstohz(kn->kn_sdata);
		if (tticks <= 0)
			tticks = 1;
		callout_schedule((callout_t *)kn->kn_hook, tticks);
	}
	mutex_exit(&kqueue_misc_lock);
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq;
	int tticks;

	tticks = mstohz(kn->kn_sdata);

	/* if the supplied value is under our resolution, use 1 tick */
	if (tticks == 0) {
		if (kn->kn_sdata == 0)
			return EINVAL;
		tticks = 1;
	}

	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
		atomic_dec_uint(&kq_ncallouts);
		return ENOMEM;
	}
	callout_init(calloutp, CALLOUT_MPSAFE);

	kq = kn->kn_kq;
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	kn->kn_hook = calloutp;
	mutex_spin_exit(&kq->kq_lock);

	callout_reset(calloutp, tticks, filt_timerexpire, kn);

	return (0);
}

static void
filt_timerdetach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	/* prevent rescheduling when we expire */
	kn->kn_flags |= EV_ONESHOT;
	mutex_spin_exit(&kq->kq_lock);

	calloutp = (callout_t *)kn->kn_hook;
	callout_halt(calloutp, NULL);
	callout_destroy(calloutp);
	kmem_free(calloutp, sizeof(*calloutp));
	atomic_dec_uint(&kq_ncallouts);
}

static int
filt_timer(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	rv = (kn->kn_data != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}
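
/*
 * Userland sketch (example code, not part of the kernel): a one-shot
 * EVFILT_TIMER kevent that fires once after 500 milliseconds, matching
 * the millisecond interpretation of kn_sdata above.
 *
 *	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE | EV_ONESHOT,
 *	    0, 500, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */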

/*
 * Filter event method for EVFILT_FS.
 */
struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);

static int
filt_fsattach(struct knote *kn)
{

	mutex_enter(&kqueue_misc_lock);
	kn->kn_flags |= EV_CLEAR;
	SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
	mutex_exit(&kqueue_misc_lock);

	return 0;
}

static void
filt_fsdetach(struct knote *kn)
{

	mutex_enter(&kqueue_misc_lock);
	SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
	mutex_exit(&kqueue_misc_lock);
}

static int
filt_fs(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_fflags |= hint;
	rv = (kn->kn_fflags != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}

/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be. This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides a full kqfilter entry for device switch tables, which
 * has the same effect as a filter using filt_seltrue() as its filter
 * method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_seltruedetach,
	.f_event = filt_seltrue,
};

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}
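
/*
 * Illustrative sketch (assumed driver code, not taken from this file):
 * a character device that is always ready for I/O can point the
 * d_kqfilter member of its cdevsw at seltrue_kqfilter, alongside
 * seltrue for poll; the example_* names are hypothetical.
 *
 *	const struct cdevsw example_cdevsw = {
 *		.d_open = example_open,
 *		.d_close = example_close,
 *		.d_read = example_read,
 *		.d_write = example_write,
 *		.d_poll = seltrue,
 *		.d_kqfilter = seltrue_kqfilter,
 *		.d_flag = D_OTHER,
 *	};
 */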

/*
 * kqueue(2) system call.
 */
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
	struct kqueue *kq;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;
	fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
	cv_init(&kq->kq_cv, "kqueue");
	selinit(&kq->kq_sel);
	TAILQ_INIT(&kq->kq_head);
	fp->f_kqueue = kq;
	*retval = fd;
	kq->kq_fdp = curlwp->l_fd;
	fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
	fd_affix(curproc, fp, fd);
	return error;
}

/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
	return kqueue1(l, 0, retval);
}

int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	return kqueue1(l, SCARG(uap, flags), retval);
}

/*
 * kevent(2) system call.
 */
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	return copyin(changelist + index, changes, n * sizeof(*changes));
}

int
kevent_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	return copyout(events, eventlist + index, n * sizeof(*events));
}

static const struct kevent_ops kevent_native_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = kevent_fetch_changes,
	.keo_put_events = kevent_put_events,
};

int
sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */

	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
	    SCARG(uap, timeout), &kevent_native_ops);
}
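
/*
 * For reference, a minimal userland sketch of the interface entered
 * through the syscalls above (example application code, not part of the
 * kernel): create a kqueue, register interest in a descriptor becoming
 * readable, and wait for one event.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent ev, res;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(EXIT_FAILURE, "kevent: register");
 *	n = kevent(kq, NULL, 0, &res, 1, NULL);
 *	if (n > 0)
 *		printf("fd %d readable, %lld bytes\n",
 *		    (int)res.ident, (long long)res.data);
 */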

int
kevent1(register_t *retval, int fd,
    const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout,
    const struct kevent_ops *keops)
{
	struct kevent *kevp;
	struct kqueue *kq;
	struct timespec ts;
	size_t i, n, ichange;
	int nerrors, error;
	struct kevent kevbuf[KQ_NEVENTS];	/* approx 300 bytes on 64-bit */
	file_t *fp;

	/* check that we're dealing with a kq */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		fd_putfile(fd);
		return (EBADF);
	}

	if (timeout != NULL) {
		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		timeout = &ts;
	}

	kq = fp->f_kqueue;
	nerrors = 0;
	ichange = 0;

	/* traverse list of events to register */
	while (nchanges > 0) {
		n = MIN(nchanges, __arraycount(kevbuf));
		error = (*keops->keo_fetch_changes)(keops->keo_private,
		    changelist, kevbuf, ichange, n);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kevbuf[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp);
			if (!error && !(kevp->flags & EV_RECEIPT))
				continue;
			if (nevents == 0)
				goto done;
			kevp->flags = EV_ERROR;
			kevp->data = error;
			error = (*keops->keo_put_events)
			    (keops->keo_private, kevp,
			    eventlist, nerrors, 1);
			if (error)
				goto done;
			nevents--;
			nerrors++;
		}
		nchanges -= n;	/* update the results */
		ichange += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
	    kevbuf, __arraycount(kevbuf));
 done:
	fd_putfile(fd);
	return (error);
}

/*
 * Register a given kevent kev onto the kqueue
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct kfilter *kfilter;
	filedesc_t *fdp;
	file_t *fp;
	fdfile_t *ff;
	struct knote *kn, *newkn;
	struct klist *list;
	int error, fd, rv;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	fd = 0;

	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);

	rw_enter(&kqueue_filter_lock, RW_READER);
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		rw_exit(&kqueue_filter_lock);
		kmem_free(newkn, sizeof(*newkn));
		return (EINVAL);
	}

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		/* validate descriptor */
		if (kev->ident > INT_MAX
		    || (fp = fd_getfile(fd = kev->ident)) == NULL) {
			rw_exit(&kqueue_filter_lock);
			kmem_free(newkn, sizeof(*newkn));
			return EBADF;
		}
		mutex_enter(&fdp->fd_lock);
		ff = fdp->fd_dt->dt_ff[fd];
		if (ff->ff_refcnt & FR_CLOSING) {
			error = EBADF;
			goto doneunlock;
		}
		if (fd <= fdp->fd_lastkqfile) {
			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		mutex_enter(&fdp->fd_lock);
		if (fdp->fd_knhashmask != 0) {
			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link) {
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			/* create new knote */
			kn = newkn;
			newkn = NULL;
			kn->kn_obj = fp;
			kn->kn_id = kev->ident;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;
			kn->kn_kfilter = kfilter;
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			KASSERT(kn->kn_fop != NULL);
			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			if (!kn->kn_fop->f_isfd) {
				/*
				 * If knote is not on an fd, store on
				 * internal hash table.
				 */
				if (fdp->fd_knhashmask == 0) {
					/* XXXAD can block with fd_lock held */
					fdp->fd_knhash = hashinit(KN_HASHSIZE,
					    HASH_LIST, true,
					    &fdp->fd_knhashmask);
				}
				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
				    fdp->fd_knhashmask)];
			} else {
				/* Otherwise, knote is on an fd. */
				list = (struct klist *)
				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
				if ((int)kn->kn_id > fdp->fd_lastkqfile)
					fdp->fd_lastkqfile = kn->kn_id;
			}
			SLIST_INSERT_HEAD(list, kn, kn_link);

			KERNEL_LOCK(1, NULL);		/* XXXSMP */
			error = (*kfilter->filtops->f_attach)(kn);
			KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
			if (error != 0) {
#ifdef DEBUG
				struct proc *p = curlwp->l_proc;
				const file_t *ft = kn->kn_obj;
				printf("%s: %s[%d]: event type %d not "
				    "supported for file type %d/%s "
				    "(error %d)\n", __func__,
				    p->p_comm, p->p_pid,
				    kn->kn_filter, ft ? ft->f_type : -1,
				    ft ? ft->f_ops->fo_name : "?", error);
#endif

				/* knote_detach() drops fdp->fd_lock */
				knote_detach(kn, fdp, false);
				goto done;
			}
			atomic_inc_uint(&kfilter->refcnt);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters that have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}
		/*
		 * We can get here if we are trying to attach
		 * an event to a file descriptor that does not
		 * support events, and the attach routine is
		 * broken and does not return an error.
		 */
		KASSERT(kn->kn_fop != NULL);
		KASSERT(kn->kn_fop->f_event != NULL);
		KERNEL_LOCK(1, NULL);			/* XXXSMP */
		rv = (*kn->kn_fop->f_event)(kn, 0);
		KERNEL_UNLOCK_ONE(NULL);		/* XXXSMP */
		if (rv)
			knote_activate(kn);
	} else {
		if (kn == NULL) {
			error = ENOENT;
			goto doneunlock;
		}
		if (kev->flags & EV_DELETE) {
			/* knote_detach() drops fdp->fd_lock */
			knote_detach(kn, fdp, true);
			goto done;
		}
	}

	/* disable knote */
	if ((kev->flags & EV_DISABLE)) {
		mutex_spin_enter(&kq->kq_lock);
		if ((kn->kn_status & KN_DISABLED) == 0)
			kn->kn_status |= KN_DISABLED;
		mutex_spin_exit(&kq->kq_lock);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE)) {
		knote_enqueue(kn);
	}
 doneunlock:
	mutex_exit(&fdp->fd_lock);
 done:
	rw_exit(&kqueue_filter_lock);
	if (newkn != NULL)
		kmem_free(newkn, sizeof(*newkn));
	if (fp != NULL)
		fd_putfile(fd);
	return (error);
}

#if defined(DEBUG)
#define	KN_FMT(buf, kn) \
    (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)

static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
	const struct knote *kn;
	int count;
	int nmarker;
	char buf[128];

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(kq->kq_count >= 0);

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
			    func, line, kq, kn, KN_FMT(buf, kn));
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
				    func, line, kq, kn, kn->kn_kq,
				    KN_FMT(buf, kn));
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
				    func, line, kq, kn, KN_FMT(buf, kn));
			}
			count++;
			if (count > kq->kq_count) {
				goto bad;
			}
		} else {
			nmarker++;
#if 0
			if (nmarker > 10000) {
				panic("%s,%zu: kq=%p too many markers: "
				    "%d != %d, nmarker=%d",
				    func, line, kq, kq->kq_count, count,
				    nmarker);
			}
#endif
		}
	}
	if (kq->kq_count != count) {
 bad:
		panic("%s,%zu: kq=%p kq->kq_count(%d) != count(%d), nmarker=%d",
		    func, line, kq, kq->kq_count, count, nmarker);
	}
}
#define	kq_check(a)	kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define	kq_check(a)	/* nothing */
#endif /* defined(DEBUG) */

/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results into ulistp. Timeout is determined by tsp; if
 * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait
 * as appropriate.
 */
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
    const struct timespec *tsp, register_t *retval,
    const struct kevent_ops *keops, struct kevent *kevbuf,
    size_t kevcnt)
{
	struct kqueue *kq;
	struct kevent *kevp;
	struct timespec ats, sleepts;
	struct knote *kn, *marker, morker;
	size_t count, nkev, nevents;
	int timeout, error, rv;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	kq = fp->f_kqueue;
	count = maxevents;
	nkev = nevents = error = 0;
	if (count == 0) {
		*retval = 0;
		return 0;
	}

	if (tsp) {			/* timeout supplied */
		ats = *tsp;
		if (inittimeleft(&ats, &sleepts) == -1) {
			*retval = maxevents;
			return EINVAL;
		}
		timeout = tstohz(&ats);
		if (timeout <= 0)
			timeout = -1;	/* do poll */
	} else {
		/* no timeout, wait forever */
		timeout = 0;
	}

	memset(&morker, 0, sizeof(morker));
	marker = &morker;
	marker->kn_status = KN_MARKER;
	mutex_spin_enter(&kq->kq_lock);
 retry:
	kevp = kevbuf;
	if (kq->kq_count == 0) {
		if (timeout >= 0) {
			error = cv_timedwait_sig(&kq->kq_cv,
			    &kq->kq_lock, timeout);
			if (error == 0) {
				if (tsp == NULL || (timeout =
				    gettimeleft(&ats, &sleepts)) > 0)
					goto retry;
			} else {
				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_spin_exit(&kq->kq_lock);
	} else {
		/* mark end of knote list */
		TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);

		/*
		 * Acquire the fdp->fd_lock interlock to avoid races with
		 * file creation/destruction from other threads.
		 */
		mutex_spin_exit(&kq->kq_lock);
		mutex_enter(&fdp->fd_lock);
		mutex_spin_enter(&kq->kq_lock);

		while (count != 0) {
			kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
			while ((kn->kn_status & KN_MARKER) != 0) {
				if (kn == marker) {
					/* it's our marker, stop */
					TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
					if (count < maxevents || (tsp != NULL &&
					    (timeout = gettimeleft(&ats,
					    &sleepts)) <= 0))
						goto done;
					mutex_exit(&fdp->fd_lock);
					goto retry;
				}
				/* someone else's marker. */
				kn = TAILQ_NEXT(kn, kn_tqe);
			}
			kq_check(kq);
			kq->kq_count--;
			TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_BUSY;
			kq_check(kq);
			if (kn->kn_status & KN_DISABLED) {
				kn->kn_status &= ~KN_BUSY;
				/* don't want disabled events */
				continue;
			}
			if ((kn->kn_flags & EV_ONESHOT) == 0) {
				mutex_spin_exit(&kq->kq_lock);
				KASSERT(kn->kn_fop != NULL);
				KASSERT(kn->kn_fop->f_event != NULL);
				KERNEL_LOCK(1, NULL);	/* XXXSMP */
				KASSERT(mutex_owned(&fdp->fd_lock));
				rv = (*kn->kn_fop->f_event)(kn, 0);
				KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
				mutex_spin_enter(&kq->kq_lock);
				/* Re-poll if note was re-enqueued. */
				if ((kn->kn_status & KN_QUEUED) != 0) {
					kn->kn_status &= ~KN_BUSY;
					continue;
				}
				if (rv == 0) {
					/*
					 * non-ONESHOT event that hasn't
					 * triggered again, so de-queue.
					 */
					kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
					continue;
				}
			}
			/* XXXAD should be got from f_event if !oneshot. */
			*kevp++ = kn->kn_kevent;
			nkev++;
			if (kn->kn_flags & EV_ONESHOT) {
				/* delete ONESHOT events after retrieval */
				kn->kn_status &= ~KN_BUSY;
				mutex_spin_exit(&kq->kq_lock);
				knote_detach(kn, fdp, true);
				mutex_enter(&fdp->fd_lock);
				mutex_spin_enter(&kq->kq_lock);
			} else if (kn->kn_flags & EV_CLEAR) {
				/* clear state after retrieval */
				kn->kn_data = 0;
				kn->kn_fflags = 0;
				kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
			} else if (kn->kn_flags & EV_DISPATCH) {
				kn->kn_status |= KN_DISABLED;
				kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
			} else {
				/* add event back on list */
				kq_check(kq);
				kn->kn_status |= KN_QUEUED;
				kn->kn_status &= ~KN_BUSY;
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
				kq->kq_count++;
				kq_check(kq);
			}
			if (nkev == kevcnt) {
				/* do copyouts in kevcnt chunks */
				mutex_spin_exit(&kq->kq_lock);
				mutex_exit(&fdp->fd_lock);
				error = (*keops->keo_put_events)
				    (keops->keo_private,
				    kevbuf, ulistp, nevents, nkev);
				mutex_enter(&fdp->fd_lock);
				mutex_spin_enter(&kq->kq_lock);
				nevents += nkev;
				nkev = 0;
				kevp = kevbuf;
			}
			count--;
			if (error != 0 || count == 0) {
				/* remove marker */
				TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
				break;
			}
		}
 done:
		mutex_spin_exit(&kq->kq_lock);
		mutex_exit(&fdp->fd_lock);
	}
	if (nkev != 0) {
		/* copyout remaining events */
		error = (*keops->keo_put_events)(keops->keo_private,
		    kevbuf, ulistp, nevents, nkev);
	}
	*retval = maxevents - count;

	return error;
}

/*
 * fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported. They both use struct kfilter_mapping:
 *	KFILTER_BYFILTER	find name for filter, and return result in
 *				name, which is of size len.
 *	KFILTER_BYNAME		find filter for name. len is ignored.
 */
/*ARGSUSED*/
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
	struct kfilter_mapping *km;
	const struct kfilter *kfilter;
	char *name;
	int error;

	km = data;
	error = 0;
	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL) {
			strlcpy(name, kfilter->name, KFILTER_MAXNAME);
			rw_exit(&kqueue_filter_lock);
			error = copyoutstr(name, km->name, km->len, NULL);
		} else {
			rw_exit(&kqueue_filter_lock);
			error = ENOENT;
		}
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error) {
			break;
		}
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		rw_exit(&kqueue_filter_lock);
		break;

	default:
		error = ENOTTY;
		break;

	}
	kmem_free(name, KFILTER_MAXNAME);
	return (error);
}

/*
 * fileops fcntl method for a kqueue descriptor.
 */
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{

	return (ENOTTY);
}

/*
 * fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(file_t *fp, int events)
{
	struct kqueue *kq;
	int revents;

	kq = fp->f_kqueue;

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		mutex_spin_enter(&kq->kq_lock);
		if (kq->kq_count != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &kq->kq_sel);
		}
		kq_check(kq);
		mutex_spin_exit(&kq->kq_lock);
	}

	return revents;
}

/*
 * fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(file_t *fp, struct stat *st)
{
	struct kqueue *kq;

	kq = fp->f_kqueue;

	memset(st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;

	return 0;
}

static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
	struct knote *kn;
	filedesc_t *fdp;

	fdp = kq->kq_fdp;

	KASSERT(mutex_owned(&fdp->fd_lock));

	for (kn = SLIST_FIRST(list); kn != NULL;) {
		if (kq != kn->kn_kq) {
			kn = SLIST_NEXT(kn, kn_link);
			continue;
		}
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
		kn = SLIST_FIRST(list);
	}
}


/*
 * fileops close method for a kqueue descriptor.
 */
static int
kqueue_close(file_t *fp)
{
	struct kqueue *kq;
	filedesc_t *fdp;
	fdfile_t *ff;
	int i;

	kq = fp->f_kqueue;
	fp->f_kqueue = NULL;
	fp->f_type = 0;
	fdp = curlwp->l_fd;

	mutex_enter(&fdp->fd_lock);
	for (i = 0; i <= fdp->fd_lastkqfile; i++) {
		if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
			continue;
		kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
		}
	}
	mutex_exit(&fdp->fd_lock);

	KASSERT(kq->kq_count == 0);
	mutex_destroy(&kq->kq_lock);
	cv_destroy(&kq->kq_cv);
	seldestroy(&kq->kq_sel);
	kmem_free(kq, sizeof(*kq));

	return (0);
}

/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	KASSERT(fp == kn->kn_obj);

	if (kn->kn_filter != EVFILT_READ)
		return 1;

	kn->kn_fop = &kqread_filtops;
	mutex_enter(&kq->kq_lock);
	SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
	mutex_exit(&kq->kq_lock);

	return 0;
}


/*
 * Walk down a list of knotes, activating them if their event has
 * triggered. The caller's object lock (e.g. device driver lock)
 * must be held.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn, *tmpkn;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
		KASSERT(kn->kn_fop != NULL);
		KASSERT(kn->kn_fop->f_event != NULL);
		if ((*kn->kn_fop->f_event)(kn, hint))
			knote_activate(kn);
	}
}
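
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a driver that has knotes attached to the klist embedded in a struct
 * selinfo in its softc typically reports new data from its interrupt
 * path with selnotify(), which in turn runs knote() on that list, with
 * the driver's own lock serving as the object lock:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_bytes_avail += n;
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
 *	mutex_exit(&sc->sc_lock);
 */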

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(int fd)
{
	struct klist *list;
	struct knote *kn;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	mutex_enter(&fdp->fd_lock);
	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
	while ((kn = SLIST_FIRST(list)) != NULL) {
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
	}
	mutex_exit(&fdp->fd_lock);
}

/*
 * Drop knote. Called with fdp->fd_lock held, and will drop the lock
 * before returning.
 */
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
	struct klist *list;
	struct kqueue *kq;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);
	KASSERT(mutex_owned(&fdp->fd_lock));

	KASSERT(kn->kn_fop != NULL);
	/* Remove from monitored object. */
	if (dofop) {
		KASSERT(kn->kn_fop->f_detach != NULL);
		KERNEL_LOCK(1, NULL);		/* XXXSMP */
		(*kn->kn_fop->f_detach)(kn);
		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
	}

	/* Remove from descriptor table. */
	if (kn->kn_fop->f_isfd)
		list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);

	/* Remove from kqueue. */
 again:
	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_QUEUED) != 0) {
		kq_check(kq);
		kq->kq_count--;
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq_check(kq);
	} else if (kn->kn_status & KN_BUSY) {
		mutex_spin_exit(&kq->kq_lock);
		goto again;
	}
	mutex_spin_exit(&kq->kq_lock);

	mutex_exit(&fdp->fd_lock);
	if (kn->kn_fop->f_isfd)
		fd_putfile(kn->kn_id);
	atomic_dec_uint(&kn->kn_kfilter->refcnt);
	kmem_free(kn, sizeof(*kn));
}

/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_DISABLED) != 0) {
		kn->kn_status &= ~KN_DISABLED;
	}
	if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
		kq_check(kq);
		kn->kn_status |= KN_QUEUED;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}
/*
 * Activate a knote, and enqueue it if it is not disabled or already queued.
 */
static void
knote_activate(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
		kq_check(kq);
		kn->kn_status |= KN_QUEUED;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}