1 /* $NetBSD: kern_event.c,v 1.102 2018/01/09 03:31:13 christos Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon (at) FreeBSD.org>
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
58 */
59
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.102 2018/01/09 03:31:13 christos Exp $");
62
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/wait.h>
67 #include <sys/proc.h>
68 #include <sys/file.h>
69 #include <sys/select.h>
70 #include <sys/queue.h>
71 #include <sys/event.h>
72 #include <sys/eventvar.h>
73 #include <sys/poll.h>
74 #include <sys/kmem.h>
75 #include <sys/stat.h>
76 #include <sys/filedesc.h>
77 #include <sys/syscallargs.h>
78 #include <sys/kauth.h>
79 #include <sys/conf.h>
80 #include <sys/atomic.h>
81
82 static int kqueue_scan(file_t *, size_t, struct kevent *,
83 const struct timespec *, register_t *,
84 const struct kevent_ops *, struct kevent *,
85 size_t);
86 static int kqueue_ioctl(file_t *, u_long, void *);
87 static int kqueue_fcntl(file_t *, u_int, void *);
88 static int kqueue_poll(file_t *, int);
89 static int kqueue_kqfilter(file_t *, struct knote *);
90 static int kqueue_stat(file_t *, struct stat *);
91 static int kqueue_close(file_t *);
92 static int kqueue_register(struct kqueue *, struct kevent *);
93 static void kqueue_doclose(struct kqueue *, struct klist *, int);
94
95 static void knote_detach(struct knote *, filedesc_t *fdp, bool);
96 static void knote_enqueue(struct knote *);
97 static void knote_activate(struct knote *);
98
99 static void filt_kqdetach(struct knote *);
100 static int filt_kqueue(struct knote *, long hint);
101 static int filt_procattach(struct knote *);
102 static void filt_procdetach(struct knote *);
103 static int filt_proc(struct knote *, long hint);
104 static int filt_fileattach(struct knote *);
105 static void filt_timerexpire(void *x);
106 static int filt_timerattach(struct knote *);
107 static void filt_timerdetach(struct knote *);
108 static int filt_timer(struct knote *, long hint);
109 static int filt_fsattach(struct knote *kn);
110 static void filt_fsdetach(struct knote *kn);
111 static int filt_fs(struct knote *kn, long hint);
112
113 static const struct fileops kqueueops = {
114 .fo_name = "kqueue",
115 .fo_read = (void *)enxio,
116 .fo_write = (void *)enxio,
117 .fo_ioctl = kqueue_ioctl,
118 .fo_fcntl = kqueue_fcntl,
119 .fo_poll = kqueue_poll,
120 .fo_stat = kqueue_stat,
121 .fo_close = kqueue_close,
122 .fo_kqfilter = kqueue_kqfilter,
123 .fo_restart = fnullop_restart,
124 };
125
126 static const struct filterops kqread_filtops = {
127 .f_isfd = 1,
128 .f_attach = NULL,
129 .f_detach = filt_kqdetach,
130 .f_event = filt_kqueue,
131 };
132
133 static const struct filterops proc_filtops = {
134 .f_isfd = 0,
135 .f_attach = filt_procattach,
136 .f_detach = filt_procdetach,
137 .f_event = filt_proc,
138 };
139
140 static const struct filterops file_filtops = {
141 .f_isfd = 1,
142 .f_attach = filt_fileattach,
143 .f_detach = NULL,
144 .f_event = NULL,
145 };
146
147 static const struct filterops timer_filtops = {
148 .f_isfd = 0,
149 .f_attach = filt_timerattach,
150 .f_detach = filt_timerdetach,
151 .f_event = filt_timer,
152 };
153
154 static const struct filterops fs_filtops = {
155 .f_isfd = 0,
156 .f_attach = filt_fsattach,
157 .f_detach = filt_fsdetach,
158 .f_event = filt_fs,
159 };
160
161 static u_int kq_ncallouts = 0;
162 static int kq_calloutmax = (4 * 1024);
163
164 #define KN_HASHSIZE 64 /* XXX should be tunable */
165 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
166
167 extern const struct filterops sig_filtops;
168
169 /*
170 * Table for all system-defined filters.
171 * These should be listed in the numeric order of the EVFILT_* defines.
172 * If filtops is NULL, the filter isn't implemented in NetBSD.
173 * End of list is when name is NULL.
174 *
175 * Note that 'refcnt' is meaningless for built-in filters.
176 */
177 struct kfilter {
178 const char *name; /* name of filter */
179 uint32_t filter; /* id of filter */
180 unsigned refcnt; /* reference count */
181 const struct filterops *filtops;/* operations for filter */
182 size_t namelen; /* length of name string */
183 };
184
185 /* System defined filters */
186 static struct kfilter sys_kfilters[] = {
187 { "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
188 { "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
189 { "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
190 { "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
191 { "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
192 { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
193 { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
194 { "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
195 { NULL, 0, 0, NULL, 0 },
196 };
197
198 /* User defined kfilters */
199 static struct kfilter *user_kfilters; /* array */
200 static int user_kfilterc; /* current offset */
201 static int user_kfiltermaxc; /* max size so far */
202 static size_t user_kfiltersz; /* size of allocated memory */
203
204 /*
205 * Global Locks.
206 *
207 * Lock order:
208 *
209 * kqueue_filter_lock
210 * -> kn_kq->kq_fdp->fd_lock
211 * -> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
212 * -> kn_kq->kq_lock
213 *
214 * Locking rules:
215 *
216 * f_attach: fdp->fd_lock, KERNEL_LOCK
217 * f_detach: fdp->fd_lock, KERNEL_LOCK
218 * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
219 * f_event via knote: whatever caller guarantees
220 * Typically, f_event(NOTE_SUBMIT) via knote: object lock
221 * f_event(!NOTE_SUBMIT) via knote: nothing,
222 * acquires/releases object lock inside.
223 */
224 static krwlock_t kqueue_filter_lock; /* lock on filter lists */
225 static kmutex_t kqueue_misc_lock; /* miscellaneous */
226
227 static kauth_listener_t kqueue_listener;
228
229 static int
230 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
231 void *arg0, void *arg1, void *arg2, void *arg3)
232 {
233 struct proc *p;
234 int result;
235
236 result = KAUTH_RESULT_DEFER;
237 p = arg0;
238
239 if (action != KAUTH_PROCESS_KEVENT_FILTER)
240 return result;
241
242 if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
243 ISSET(p->p_flag, PK_SUGID)))
244 return result;
245
246 result = KAUTH_RESULT_ALLOW;
247
248 return result;
249 }
250
251 /*
252 * Initialize the kqueue subsystem.
253 */
254 void
255 kqueue_init(void)
256 {
257
258 rw_init(&kqueue_filter_lock);
259 mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
260
261 kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
262 kqueue_listener_cb, NULL);
263 }
264
265 /*
266 * Find kfilter entry by name, or NULL if not found.
267 */
268 static struct kfilter *
269 kfilter_byname_sys(const char *name)
270 {
271 int i;
272
273 KASSERT(rw_lock_held(&kqueue_filter_lock));
274
275 for (i = 0; sys_kfilters[i].name != NULL; i++) {
276 if (strcmp(name, sys_kfilters[i].name) == 0)
277 return &sys_kfilters[i];
278 }
279 return NULL;
280 }
281
282 static struct kfilter *
283 kfilter_byname_user(const char *name)
284 {
285 int i;
286
287 KASSERT(rw_lock_held(&kqueue_filter_lock));
288
289 /* user filter slots have a NULL name if previously deregistered */
290 for (i = 0; i < user_kfilterc ; i++) {
291 if (user_kfilters[i].name != NULL &&
292 strcmp(name, user_kfilters[i].name) == 0)
293 return &user_kfilters[i];
294 }
295 return NULL;
296 }
297
298 static struct kfilter *
299 kfilter_byname(const char *name)
300 {
301 struct kfilter *kfilter;
302
303 KASSERT(rw_lock_held(&kqueue_filter_lock));
304
305 if ((kfilter = kfilter_byname_sys(name)) != NULL)
306 return kfilter;
307
308 return kfilter_byname_user(name);
309 }
310
311 /*
312 * Find kfilter entry by filter id, or NULL if not found.
313 * Assumes entries are indexed in filter id order, for speed.
314 */
315 static struct kfilter *
316 kfilter_byfilter(uint32_t filter)
317 {
318 struct kfilter *kfilter;
319
320 KASSERT(rw_lock_held(&kqueue_filter_lock));
321
322 if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
323 kfilter = &sys_kfilters[filter];
324 else if (user_kfilters != NULL &&
325 filter < EVFILT_SYSCOUNT + user_kfilterc)
326 /* it's a user filter */
327 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
328 else
329 return (NULL); /* out of range */
330 KASSERT(kfilter->filter == filter); /* sanity check! */
331 return (kfilter);
332 }
333
334 /*
335 * Register a new kfilter. Stores the entry in user_kfilters.
336 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
337 * If retfilter != NULL, the new filterid is returned in it.
338 */
339 int
340 kfilter_register(const char *name, const struct filterops *filtops,
341 int *retfilter)
342 {
343 struct kfilter *kfilter;
344 size_t len;
345 int i;
346
347 if (name == NULL || name[0] == '\0' || filtops == NULL)
348 return (EINVAL); /* invalid args */
349
350 rw_enter(&kqueue_filter_lock, RW_WRITER);
351 if (kfilter_byname(name) != NULL) {
352 rw_exit(&kqueue_filter_lock);
353 return (EEXIST); /* already exists */
354 }
355 if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
356 rw_exit(&kqueue_filter_lock);
357 return (EINVAL); /* too many */
358 }
359
360 for (i = 0; i < user_kfilterc; i++) {
361 kfilter = &user_kfilters[i];
362 if (kfilter->name == NULL) {
363 /* Previously deregistered slot. Reuse. */
364 goto reuse;
365 }
366 }
367
368 /* check if need to grow user_kfilters */
369 if (user_kfilterc + 1 > user_kfiltermaxc) {
370 /* Grow in KFILTER_EXTENT chunks. */
371 user_kfiltermaxc += KFILTER_EXTENT;
372 len = user_kfiltermaxc * sizeof(*kfilter);
373 kfilter = kmem_alloc(len, KM_SLEEP);
374 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
375 if (user_kfilters != NULL) {
376 memcpy(kfilter, user_kfilters, user_kfiltersz);
377 kmem_free(user_kfilters, user_kfiltersz);
378 }
379 user_kfiltersz = len;
380 user_kfilters = kfilter;
381 }
382 /* Adding new slot */
383 kfilter = &user_kfilters[user_kfilterc++];
384 reuse:
385 kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
386
387 kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
388
389 kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
390 memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
391
392 if (retfilter != NULL)
393 *retfilter = kfilter->filter;
394 rw_exit(&kqueue_filter_lock);
395
396 return (0);
397 }
398
399 /*
400 * Unregister a kfilter previously registered with kfilter_register.
401 * This retains the filter id, but clears the name and frees filtops (filter
402 * operations), so that the number isn't reused during a boot.
403 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
404 */
405 int
406 kfilter_unregister(const char *name)
407 {
408 struct kfilter *kfilter;
409
410 if (name == NULL || name[0] == '\0')
411 return (EINVAL); /* invalid name */
412
413 rw_enter(&kqueue_filter_lock, RW_WRITER);
414 if (kfilter_byname_sys(name) != NULL) {
415 rw_exit(&kqueue_filter_lock);
416 return (EINVAL); /* can't detach system filters */
417 }
418
419 kfilter = kfilter_byname_user(name);
420 if (kfilter == NULL) {
421 rw_exit(&kqueue_filter_lock);
422 return (ENOENT);
423 }
424 if (kfilter->refcnt != 0) {
425 rw_exit(&kqueue_filter_lock);
426 return (EBUSY);
427 }
428
429 /* Cast away const (but we know it's safe). */
430 kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
431 kfilter->name = NULL; /* mark as `not implemented' */
432
433 if (kfilter->filtops != NULL) {
434 /* Cast away const (but we know it's safe). */
435 kmem_free(__UNCONST(kfilter->filtops),
436 sizeof(*kfilter->filtops));
437 kfilter->filtops = NULL; /* mark as `not implemented' */
438 }
439 rw_exit(&kqueue_filter_lock);
440
441 return (0);
442 }
443
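/*
 * Example (illustrative only, not part of this file): a minimal sketch
 * of how a kernel component might register a custom filter with
 * kfilter_register() and remove it again with kfilter_unregister().
 * All of the "example_*" names below are hypothetical.
 */
#if 0
static int
example_filt_attach(struct knote *kn)
{
	/* hook the knote up to the monitored object here */
	return 0;
}

static void
example_filt_detach(struct knote *kn)
{
	/* undo whatever example_filt_attach() did */
}

static int
example_filt_event(struct knote *kn, long hint)
{
	/* return non-zero when the event should fire */
	return 0;
}

static const struct filterops example_filtops = {
	.f_isfd = 0,
	.f_attach = example_filt_attach,
	.f_detach = example_filt_detach,
	.f_event = example_filt_event,
};

static int example_filter;	/* filter id assigned by kfilter_register() */

static int
example_register(void)
{
	/* fails with EEXIST if the name is already registered */
	return kfilter_register("EVFILT_EXAMPLE", &example_filtops,
	    &example_filter);
}

static int
example_unregister(void)
{
	/* fails with EBUSY while any knote still uses the filter */
	return kfilter_unregister("EVFILT_EXAMPLE");
}
#endif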
444
445 /*
446 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
447 * descriptors. Calls fileops kqfilter method for given file descriptor.
448 */
449 static int
450 filt_fileattach(struct knote *kn)
451 {
452 file_t *fp;
453
454 fp = kn->kn_obj;
455
456 return (*fp->f_ops->fo_kqfilter)(fp, kn);
457 }
458
459 /*
460 * Filter detach method for EVFILT_READ on kqueue descriptor.
461 */
462 static void
463 filt_kqdetach(struct knote *kn)
464 {
465 struct kqueue *kq;
466
467 kq = ((file_t *)kn->kn_obj)->f_kqueue;
468
469 mutex_spin_enter(&kq->kq_lock);
470 SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
471 mutex_spin_exit(&kq->kq_lock);
472 }
473
474 /*
475 * Filter event method for EVFILT_READ on kqueue descriptor.
476 */
477 /*ARGSUSED*/
478 static int
479 filt_kqueue(struct knote *kn, long hint)
480 {
481 struct kqueue *kq;
482 int rv;
483
484 kq = ((file_t *)kn->kn_obj)->f_kqueue;
485
486 if (hint != NOTE_SUBMIT)
487 mutex_spin_enter(&kq->kq_lock);
488 kn->kn_data = kq->kq_count;
489 rv = (kn->kn_data > 0);
490 if (hint != NOTE_SUBMIT)
491 mutex_spin_exit(&kq->kq_lock);
492
493 return rv;
494 }
495
496 /*
497 * Filter attach method for EVFILT_PROC.
498 */
499 static int
500 filt_procattach(struct knote *kn)
501 {
502 struct proc *p;
503 struct lwp *curl;
504
505 curl = curlwp;
506
507 mutex_enter(proc_lock);
508 if (kn->kn_flags & EV_FLAG1) {
509 /*
510 * NOTE_TRACK attaches to the child process too early
511 * for proc_find, so do a raw look up and check the state
512 * explicitly.
513 */
514 p = proc_find_raw(kn->kn_id);
515 if (p != NULL && p->p_stat != SIDL)
516 p = NULL;
517 } else {
518 p = proc_find(kn->kn_id);
519 }
520
521 if (p == NULL) {
522 mutex_exit(proc_lock);
523 return ESRCH;
524 }
525
526 /*
527 * Fail if it's not owned by you, or the last exec gave us
528 * setuid/setgid privs (unless you're root).
529 */
530 mutex_enter(p->p_lock);
531 mutex_exit(proc_lock);
532 if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
533 p, NULL, NULL, NULL) != 0) {
534 mutex_exit(p->p_lock);
535 return EACCES;
536 }
537
538 kn->kn_obj = p;
539 kn->kn_flags |= EV_CLEAR; /* automatically set */
540
541 /*
542 * internal flag indicating registration done by kernel
543 */
544 if (kn->kn_flags & EV_FLAG1) {
545 kn->kn_data = kn->kn_sdata; /* ppid */
546 kn->kn_fflags = NOTE_CHILD;
547 kn->kn_flags &= ~EV_FLAG1;
548 }
549 SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
550 mutex_exit(p->p_lock);
551
552 return 0;
553 }
554
555 /*
556 * Filter detach method for EVFILT_PROC.
557 *
558 * The knote may be attached to a different process, which may exit,
559 * leaving nothing for the knote to be attached to. So when the process
560 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
561 * it will be deleted when read out. However, as part of the knote deletion,
562 * this routine is called, so a check is needed to avoid actually performing
563 * a detach, because the original process might not exist any more.
564 */
565 static void
566 filt_procdetach(struct knote *kn)
567 {
568 struct proc *p;
569
570 if (kn->kn_status & KN_DETACHED)
571 return;
572
573 p = kn->kn_obj;
574
575 mutex_enter(p->p_lock);
576 SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
577 mutex_exit(p->p_lock);
578 }
579
580 /*
581 * Filter event method for EVFILT_PROC.
582 */
583 static int
584 filt_proc(struct knote *kn, long hint)
585 {
586 u_int event, fflag;
587 struct kevent kev;
588 struct kqueue *kq;
589 int error;
590
591 event = (u_int)hint & NOTE_PCTRLMASK;
592 kq = kn->kn_kq;
593 fflag = 0;
594
595 /* If the user is interested in this event, record it. */
596 if (kn->kn_sfflags & event)
597 fflag |= event;
598
599 if (event == NOTE_EXIT) {
600 struct proc *p = kn->kn_obj;
601
602 if (p != NULL)
603 kn->kn_data = P_WAITSTATUS(p);
604 /*
605 * Process is gone, so flag the event as finished.
606 *
607 * Detach the knote from watched process and mark
608 * it as such. We can't leave this to kqueue_scan(),
609 * since the process might not exist by then. And we
610 * have to do this now, since the psignal KNOTE() is also
611 * called for zombies and we might end up reading freed
612 * memory if the kevent had already been picked up and the
613 * knote g/c'ed.
614 */
615 filt_procdetach(kn);
616
617 mutex_spin_enter(&kq->kq_lock);
618 kn->kn_status |= KN_DETACHED;
619 /* Mark as ONESHOT, so that the knote is g/c'ed when read */
620 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
621 kn->kn_fflags |= fflag;
622 mutex_spin_exit(&kq->kq_lock);
623
624 return 1;
625 }
626
627 mutex_spin_enter(&kq->kq_lock);
628 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
629 /*
630 * Process forked, and user wants to track the new process,
631 * so attach a new knote to it, and immediately report an
632 * event with the parent's pid. Register knote with new
633 * process.
634 */
635 kev.ident = hint & NOTE_PDATAMASK; /* pid */
636 kev.filter = kn->kn_filter;
637 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
638 kev.fflags = kn->kn_sfflags;
639 kev.data = kn->kn_id; /* parent */
640 kev.udata = kn->kn_kevent.udata; /* preserve udata */
641 mutex_spin_exit(&kq->kq_lock);
642 error = kqueue_register(kq, &kev);
643 mutex_spin_enter(&kq->kq_lock);
644 if (error != 0)
645 kn->kn_fflags |= NOTE_TRACKERR;
646 }
647 kn->kn_fflags |= fflag;
648 fflag = kn->kn_fflags;
649 mutex_spin_exit(&kq->kq_lock);
650
651 return fflag != 0;
652 }
653
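/*
 * Example (userland, illustrative only): registering the EVFILT_PROC
 * filter above to watch a process, with NOTE_TRACK so that forked
 * descendants are followed automatically as described in filt_proc().
 * Error handling is minimal.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>

static void
watch_process(pid_t pid)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* report exit and fork events; track children via NOTE_TRACK */
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent");
		/*
		 * NOTE_CHILD: kev.ident is a new child, kev.data its parent.
		 * NOTE_EXIT:  kev.data holds the wait(2) status.
		 */
		if (kev.fflags & NOTE_EXIT)
			break;
	}
}
#endif
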
654 static void
655 filt_timerexpire(void *knx)
656 {
657 struct knote *kn = knx;
658 int tticks;
659
660 mutex_enter(&kqueue_misc_lock);
661 kn->kn_data++;
662 knote_activate(kn);
663 if ((kn->kn_flags & EV_ONESHOT) == 0) {
664 tticks = mstohz(kn->kn_sdata);
665 if (tticks <= 0)
666 tticks = 1;
667 callout_schedule((callout_t *)kn->kn_hook, tticks);
668 }
669 mutex_exit(&kqueue_misc_lock);
670 }
671
672 /*
673 * data contains amount of time to sleep, in milliseconds
674 */
675 static int
676 filt_timerattach(struct knote *kn)
677 {
678 callout_t *calloutp;
679 struct kqueue *kq;
680 int tticks;
681
682 tticks = mstohz(kn->kn_sdata);
683
684 /* if the supplied value is under our resolution, use 1 tick */
685 if (tticks == 0) {
686 if (kn->kn_sdata == 0)
687 return EINVAL;
688 tticks = 1;
689 }
690
691 if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
692 (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
693 atomic_dec_uint(&kq_ncallouts);
694 return ENOMEM;
695 }
696 callout_init(calloutp, CALLOUT_MPSAFE);
697
698 kq = kn->kn_kq;
699 mutex_spin_enter(&kq->kq_lock);
700 kn->kn_flags |= EV_CLEAR; /* automatically set */
701 kn->kn_hook = calloutp;
702 mutex_spin_exit(&kq->kq_lock);
703
704 callout_reset(calloutp, tticks, filt_timerexpire, kn);
705
706 return (0);
707 }
708
709 static void
710 filt_timerdetach(struct knote *kn)
711 {
712 callout_t *calloutp;
713
714 calloutp = (callout_t *)kn->kn_hook;
715 callout_halt(calloutp, NULL);
716 callout_destroy(calloutp);
717 kmem_free(calloutp, sizeof(*calloutp));
718 atomic_dec_uint(&kq_ncallouts);
719 }
720
721 static int
722 filt_timer(struct knote *kn, long hint)
723 {
724 int rv;
725
726 mutex_enter(&kqueue_misc_lock);
727 rv = (kn->kn_data != 0);
728 mutex_exit(&kqueue_misc_lock);
729
730 return rv;
731 }
732
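/*
 * Example (userland, illustrative only): driving the EVFILT_TIMER
 * filter above.  The kevent data field is the period in milliseconds;
 * filt_timerattach() converts it to ticks with mstohz() and arms a
 * callout that re-schedules itself until the knote is deleted.
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
tick_every_500ms(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* ident 1 is an arbitrary per-kqueue timer identifier */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent");
		/* kev.data counts expirations since the last retrieval */
	}
}
#endif
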
733 /*
734 * Filter event method for EVFILT_FS.
735 */
736 struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
737
738 static int
739 filt_fsattach(struct knote *kn)
740 {
741
742 mutex_enter(&kqueue_misc_lock);
743 kn->kn_flags |= EV_CLEAR;
744 SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
745 mutex_exit(&kqueue_misc_lock);
746
747 return 0;
748 }
749
750 static void
751 filt_fsdetach(struct knote *kn)
752 {
753
754 mutex_enter(&kqueue_misc_lock);
755 SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
756 mutex_exit(&kqueue_misc_lock);
757 }
758
759 static int
760 filt_fs(struct knote *kn, long hint)
761 {
762 int rv;
763
764 mutex_enter(&kqueue_misc_lock);
765 kn->kn_fflags |= hint;
766 rv = (kn->kn_fflags != 0);
767 mutex_exit(&kqueue_misc_lock);
768
769 return rv;
770 }
771
772 /*
773 * filt_seltrue:
774 *
775 * This filter "event" routine simulates seltrue().
776 */
777 int
778 filt_seltrue(struct knote *kn, long hint)
779 {
780
781 /*
782 * We don't know how much data can be read/written,
783 * but we know that it *can* be. This is about as
784 * good as select/poll does.
785 */
786 kn->kn_data = 0;
787 return (1);
788 }
789
790 /*
791 * This provides a full kqfilter entry for device switch tables, which
792 * has the same effect as a filter using filt_seltrue() as its filter method.
793 */
794 static void
795 filt_seltruedetach(struct knote *kn)
796 {
797 /* Nothing to do */
798 }
799
800 const struct filterops seltrue_filtops = {
801 .f_isfd = 1,
802 .f_attach = NULL,
803 .f_detach = filt_seltruedetach,
804 .f_event = filt_seltrue,
805 };
806
807 int
808 seltrue_kqfilter(dev_t dev, struct knote *kn)
809 {
810 switch (kn->kn_filter) {
811 case EVFILT_READ:
812 case EVFILT_WRITE:
813 kn->kn_fop = &seltrue_filtops;
814 break;
815 default:
816 return (EINVAL);
817 }
818
819 /* Nothing more to do */
820 return (0);
821 }
822
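/*
 * Example (illustrative only): a character device whose read/write
 * sides are always ready can point the d_kqfilter member of its
 * struct cdevsw at a thin wrapper around seltrue_kqfilter().  The
 * "exampledev" name is hypothetical.
 */
#if 0
static int
exampledev_kqfilter(dev_t dev, struct knote *kn)
{

	/* EVFILT_READ and EVFILT_WRITE always report ready */
	return seltrue_kqfilter(dev, kn);
}
#endif
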
823 /*
824 * kqueue(2) system call.
825 */
826 static int
827 kqueue1(struct lwp *l, int flags, register_t *retval)
828 {
829 struct kqueue *kq;
830 file_t *fp;
831 int fd, error;
832
833 if ((error = fd_allocfile(&fp, &fd)) != 0)
834 return error;
835 fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
836 fp->f_type = DTYPE_KQUEUE;
837 fp->f_ops = &kqueueops;
838 kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
839 mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
840 cv_init(&kq->kq_cv, "kqueue");
841 selinit(&kq->kq_sel);
842 TAILQ_INIT(&kq->kq_head);
843 fp->f_kqueue = kq;
844 *retval = fd;
845 kq->kq_fdp = curlwp->l_fd;
846 fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
847 fd_affix(curproc, fp, fd);
848 return error;
849 }
850
851 /*
852 * kqueue(2) system call.
853 */
854 int
855 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
856 {
857 return kqueue1(l, 0, retval);
858 }
859
860 int
861 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
862 register_t *retval)
863 {
864 /* {
865 syscallarg(int) flags;
866 } */
867 return kqueue1(l, SCARG(uap, flags), retval);
868 }
869
870 /*
871 * kevent(2) system call.
872 */
873 int
874 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
875 struct kevent *changes, size_t index, int n)
876 {
877
878 return copyin(changelist + index, changes, n * sizeof(*changes));
879 }
880
881 int
882 kevent_put_events(void *ctx, struct kevent *events,
883 struct kevent *eventlist, size_t index, int n)
884 {
885
886 return copyout(events, eventlist + index, n * sizeof(*events));
887 }
888
889 static const struct kevent_ops kevent_native_ops = {
890 .keo_private = NULL,
891 .keo_fetch_timeout = copyin,
892 .keo_fetch_changes = kevent_fetch_changes,
893 .keo_put_events = kevent_put_events,
894 };
895
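/*
 * Example (illustrative only): kevent1() below takes its fetch/store
 * operations through a struct kevent_ops, so an in-kernel caller whose
 * change and event lists already live in kernel memory could supply an
 * ops vector that uses memcpy() instead of copyin()/copyout().  The
 * "kern_*" names are hypothetical; compat code uses the same hook to
 * convert foreign kevent layouts.
 */
#if 0
static int
kern_fetch_timeout(const void *src, void *dst, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

static int
kern_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{
	memcpy(changes, changelist + index, n * sizeof(*changes));
	return 0;
}

static int
kern_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{
	memcpy(eventlist + index, events, n * sizeof(*events));
	return 0;
}

static const struct kevent_ops kern_kevent_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = kern_fetch_timeout,
	.keo_fetch_changes = kern_fetch_changes,
	.keo_put_events = kern_put_events,
};
#endif
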
896 int
897 sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
898 register_t *retval)
899 {
900 /* {
901 syscallarg(int) fd;
902 syscallarg(const struct kevent *) changelist;
903 syscallarg(size_t) nchanges;
904 syscallarg(struct kevent *) eventlist;
905 syscallarg(size_t) nevents;
906 syscallarg(const struct timespec *) timeout;
907 } */
908
909 return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
910 SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
911 SCARG(uap, timeout), &kevent_native_ops);
912 }
913
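/*
 * Example (userland, illustrative only): the typical calling sequence
 * for the kqueue(2)/kevent(2) pair implemented above -- create a
 * queue, register a change, then block for events.
 */
#if 0
#include <sys/event.h>
#include <err.h>
#include <unistd.h>

static int
wait_until_readable(int fd)
{
	struct kevent change, event;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* one change: add an EVFILT_READ knote for fd */
	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);

	/* register the change and block until one event is returned */
	if (kevent(kq, &change, 1, &event, 1, NULL) == -1)
		err(1, "kevent");

	close(kq);
	/* event.data is the amount readable, as set by the filter's f_event */
	return (int)event.data;
}
#endif
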
914 int
915 kevent1(register_t *retval, int fd,
916 const struct kevent *changelist, size_t nchanges,
917 struct kevent *eventlist, size_t nevents,
918 const struct timespec *timeout,
919 const struct kevent_ops *keops)
920 {
921 struct kevent *kevp;
922 struct kqueue *kq;
923 struct timespec ts;
924 size_t i, n, ichange;
925 int nerrors, error;
926 struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
927 file_t *fp;
928
929 /* check that we're dealing with a kq */
930 fp = fd_getfile(fd);
931 if (fp == NULL)
932 return (EBADF);
933
934 if (fp->f_type != DTYPE_KQUEUE) {
935 fd_putfile(fd);
936 return (EBADF);
937 }
938
939 if (timeout != NULL) {
940 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
941 if (error)
942 goto done;
943 timeout = &ts;
944 }
945
946 kq = fp->f_kqueue;
947 nerrors = 0;
948 ichange = 0;
949
950 /* traverse list of events to register */
951 while (nchanges > 0) {
952 n = MIN(nchanges, __arraycount(kevbuf));
953 error = (*keops->keo_fetch_changes)(keops->keo_private,
954 changelist, kevbuf, ichange, n);
955 if (error)
956 goto done;
957 for (i = 0; i < n; i++) {
958 kevp = &kevbuf[i];
959 kevp->flags &= ~EV_SYSFLAGS;
960 /* register each knote */
961 error = kqueue_register(kq, kevp);
962 if (!error && !(kevp->flags & EV_RECEIPT))
963 continue;
964 if (nevents == 0)
965 goto done;
966 kevp->flags = EV_ERROR;
967 kevp->data = error;
968 error = (*keops->keo_put_events)
969 (keops->keo_private, kevp,
970 eventlist, nerrors, 1);
971 if (error)
972 goto done;
973 nevents--;
974 nerrors++;
975 }
976 nchanges -= n; /* update the results */
977 ichange += n;
978 }
979 if (nerrors) {
980 *retval = nerrors;
981 error = 0;
982 goto done;
983 }
984
985 /* actually scan through the events */
986 error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
987 kevbuf, __arraycount(kevbuf));
988 done:
989 fd_putfile(fd);
990 return (error);
991 }
992
993 /*
994 * Register a given kevent kev onto the kqueue
995 */
996 static int
997 kqueue_register(struct kqueue *kq, struct kevent *kev)
998 {
999 struct kfilter *kfilter;
1000 filedesc_t *fdp;
1001 file_t *fp;
1002 fdfile_t *ff;
1003 struct knote *kn, *newkn;
1004 struct klist *list;
1005 int error, fd, rv;
1006
1007 fdp = kq->kq_fdp;
1008 fp = NULL;
1009 kn = NULL;
1010 error = 0;
1011 fd = 0;
1012
1013 newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
1014
1015 rw_enter(&kqueue_filter_lock, RW_READER);
1016 kfilter = kfilter_byfilter(kev->filter);
1017 if (kfilter == NULL || kfilter->filtops == NULL) {
1018 /* filter not found or not implemented */
1019 rw_exit(&kqueue_filter_lock);
1020 kmem_free(newkn, sizeof(*newkn));
1021 return (EINVAL);
1022 }
1023
1024 /* search if knote already exists */
1025 if (kfilter->filtops->f_isfd) {
1026 /* monitoring a file descriptor */
1027 /* validate descriptor */
1028 if (kev->ident > INT_MAX
1029 || (fp = fd_getfile(fd = kev->ident)) == NULL) {
1030 rw_exit(&kqueue_filter_lock);
1031 kmem_free(newkn, sizeof(*newkn));
1032 return EBADF;
1033 }
1034 mutex_enter(&fdp->fd_lock);
1035 ff = fdp->fd_dt->dt_ff[fd];
1036 if (ff->ff_refcnt & FR_CLOSING) {
1037 error = EBADF;
1038 goto doneunlock;
1039 }
1040 if (fd <= fdp->fd_lastkqfile) {
1041 SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
1042 if (kq == kn->kn_kq &&
1043 kev->filter == kn->kn_filter)
1044 break;
1045 }
1046 }
1047 } else {
1048 /*
1049 * not monitoring a file descriptor, so
1050 * lookup knotes in internal hash table
1051 */
1052 mutex_enter(&fdp->fd_lock);
1053 if (fdp->fd_knhashmask != 0) {
1054 list = &fdp->fd_knhash[
1055 KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1056 SLIST_FOREACH(kn, list, kn_link) {
1057 if (kev->ident == kn->kn_id &&
1058 kq == kn->kn_kq &&
1059 kev->filter == kn->kn_filter)
1060 break;
1061 }
1062 }
1063 }
1064
1065 /*
1066 * kn now contains the matching knote, or NULL if no match
1067 */
1068 if (kev->flags & EV_ADD) {
1069 if (kn == NULL) {
1070 /* create new knote */
1071 kn = newkn;
1072 newkn = NULL;
1073 kn->kn_obj = fp;
1074 kn->kn_id = kev->ident;
1075 kn->kn_kq = kq;
1076 kn->kn_fop = kfilter->filtops;
1077 kn->kn_kfilter = kfilter;
1078 kn->kn_sfflags = kev->fflags;
1079 kn->kn_sdata = kev->data;
1080 kev->fflags = 0;
1081 kev->data = 0;
1082 kn->kn_kevent = *kev;
1083
1084 KASSERT(kn->kn_fop != NULL);
1085 /*
1086 * apply reference count to knote structure, and
1087 * do not release it at the end of this routine.
1088 */
1089 fp = NULL;
1090
1091 if (!kn->kn_fop->f_isfd) {
1092 /*
1093 * If knote is not on an fd, store on
1094 * internal hash table.
1095 */
1096 if (fdp->fd_knhashmask == 0) {
1097 /* XXXAD can block with fd_lock held */
1098 fdp->fd_knhash = hashinit(KN_HASHSIZE,
1099 HASH_LIST, true,
1100 &fdp->fd_knhashmask);
1101 }
1102 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
1103 fdp->fd_knhashmask)];
1104 } else {
1105 /* Otherwise, knote is on an fd. */
1106 list = (struct klist *)
1107 &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1108 if ((int)kn->kn_id > fdp->fd_lastkqfile)
1109 fdp->fd_lastkqfile = kn->kn_id;
1110 }
1111 SLIST_INSERT_HEAD(list, kn, kn_link);
1112
1113 KERNEL_LOCK(1, NULL); /* XXXSMP */
1114 error = (*kfilter->filtops->f_attach)(kn);
1115 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1116 if (error != 0) {
1117 #ifdef DEBUG
1118 const file_t *ft = kn->kn_obj;
1119 uprintf("%s: event type %d not supported for "
1120 "file type %d/%s (error %d)\n", __func__,
1121 kn->kn_filter, ft ? ft->f_type : -1,
1122 ft ? ft->f_ops->fo_name : "?", error);
1123 #endif
1124
1125 /* knote_detach() drops fdp->fd_lock */
1126 knote_detach(kn, fdp, false);
1127 goto done;
1128 }
1129 atomic_inc_uint(&kfilter->refcnt);
1130 } else {
1131 /*
1132 * The user may change some filter values after the
1133 * initial EV_ADD, but doing so will not reset any
1134 * filters which have already been triggered.
1135 */
1136 kn->kn_sfflags = kev->fflags;
1137 kn->kn_sdata = kev->data;
1138 kn->kn_kevent.udata = kev->udata;
1139 }
1140 /*
1141 * We can get here if we are trying to attach
1142 * an event to a file descriptor that does not
1143 * support events, and the attach routine is
1144 * broken and does not return an error.
1145 */
1146 KASSERT(kn->kn_fop != NULL);
1147 KASSERT(kn->kn_fop->f_event != NULL);
1148 KERNEL_LOCK(1, NULL); /* XXXSMP */
1149 rv = (*kn->kn_fop->f_event)(kn, 0);
1150 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1151 if (rv)
1152 knote_activate(kn);
1153 } else {
1154 if (kn == NULL) {
1155 error = ENOENT;
1156 goto doneunlock;
1157 }
1158 if (kev->flags & EV_DELETE) {
1159 /* knote_detach() drops fdp->fd_lock */
1160 knote_detach(kn, fdp, true);
1161 goto done;
1162 }
1163 }
1164
1165 /* disable knote */
1166 if ((kev->flags & EV_DISABLE)) {
1167 mutex_spin_enter(&kq->kq_lock);
1168 if ((kn->kn_status & KN_DISABLED) == 0)
1169 kn->kn_status |= KN_DISABLED;
1170 mutex_spin_exit(&kq->kq_lock);
1171 }
1172
1173 /* enable knote */
1174 if ((kev->flags & EV_ENABLE)) {
1175 knote_enqueue(kn);
1176 }
1177 doneunlock:
1178 mutex_exit(&fdp->fd_lock);
1179 done:
1180 rw_exit(&kqueue_filter_lock);
1181 if (newkn != NULL)
1182 kmem_free(newkn, sizeof(*newkn));
1183 if (fp != NULL)
1184 fd_putfile(fd);
1185 return (error);
1186 }
1187
1188 #if defined(DEBUG)
1189 #define KN_FMT(buf, kn) \
1190 (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
1191
1192 static void
1193 kqueue_check(const char *func, size_t line, const struct kqueue *kq)
1194 {
1195 const struct knote *kn;
1196 int count;
1197 int nmarker;
1198 char buf[128];
1199
1200 KASSERT(mutex_owned(&kq->kq_lock));
1201 KASSERT(kq->kq_count >= 0);
1202
1203 count = 0;
1204 nmarker = 0;
1205 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1206 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
1207 panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
1208 func, line, kq, kn, KN_FMT(buf, kn));
1209 }
1210 if ((kn->kn_status & KN_MARKER) == 0) {
1211 if (kn->kn_kq != kq) {
1212 panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
1213 func, line, kq, kn, kn->kn_kq,
1214 KN_FMT(buf, kn));
1215 }
1216 if ((kn->kn_status & KN_ACTIVE) == 0) {
1217 panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
1218 func, line, kq, kn, KN_FMT(buf, kn));
1219 }
1220 count++;
1221 if (count > kq->kq_count) {
1222 goto bad;
1223 }
1224 } else {
1225 nmarker++;
1226 #if 0
1227 if (nmarker > 10000) {
1228 panic("%s,%zu: kq=%p too many markers: "
1229 "%d != %d, nmarker=%d",
1230 func, line, kq, kq->kq_count, count,
1231 nmarker);
1232 }
1233 #endif
1234 }
1235 }
1236 if (kq->kq_count != count) {
1237 bad:
1238 panic("%s,%zu: kq=%p kq->kq_count(%d) != count(%d), nmarker=%d",
1239 func, line, kq, kq->kq_count, count, nmarker);
1240 }
1241 }
1242 #define kq_check(a) kqueue_check(__func__, __LINE__, (a))
1243 #else /* defined(DEBUG) */
1244 #define kq_check(a) /* nothing */
1245 #endif /* defined(DEBUG) */
1246
1247 /*
1248 * Scan through the list of events on fp (for a maximum of maxevents),
1249 * returning the results into ulistp. Timeout is determined by tsp; if
1250 * NULL, wait indefinitely; if 0 valued, perform a poll; otherwise wait
1251 * as appropriate.
1252 */
1253 static int
1254 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
1255 const struct timespec *tsp, register_t *retval,
1256 const struct kevent_ops *keops, struct kevent *kevbuf,
1257 size_t kevcnt)
1258 {
1259 struct kqueue *kq;
1260 struct kevent *kevp;
1261 struct timespec ats, sleepts;
1262 struct knote *kn, *marker, morker;
1263 size_t count, nkev, nevents;
1264 int timeout, error, rv;
1265 filedesc_t *fdp;
1266
1267 fdp = curlwp->l_fd;
1268 kq = fp->f_kqueue;
1269 count = maxevents;
1270 nkev = nevents = error = 0;
1271 if (count == 0) {
1272 *retval = 0;
1273 return 0;
1274 }
1275
1276 if (tsp) { /* timeout supplied */
1277 ats = *tsp;
1278 if (inittimeleft(&ats, &sleepts) == -1) {
1279 *retval = maxevents;
1280 return EINVAL;
1281 }
1282 timeout = tstohz(&ats);
1283 if (timeout <= 0)
1284 timeout = -1; /* do poll */
1285 } else {
1286 /* no timeout, wait forever */
1287 timeout = 0;
1288 }
1289
1290 memset(&morker, 0, sizeof(morker));
1291 marker = &morker;
1292 marker->kn_status = KN_MARKER;
1293 mutex_spin_enter(&kq->kq_lock);
1294 retry:
1295 kevp = kevbuf;
1296 if (kq->kq_count == 0) {
1297 if (timeout >= 0) {
1298 error = cv_timedwait_sig(&kq->kq_cv,
1299 &kq->kq_lock, timeout);
1300 if (error == 0) {
1301 if (tsp == NULL || (timeout =
1302 gettimeleft(&ats, &sleepts)) > 0)
1303 goto retry;
1304 } else {
1305 /* don't restart after signals... */
1306 if (error == ERESTART)
1307 error = EINTR;
1308 if (error == EWOULDBLOCK)
1309 error = 0;
1310 }
1311 }
1312 mutex_spin_exit(&kq->kq_lock);
1313 } else {
1314 /* mark end of knote list */
1315 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1316
1317 /*
1318 * Acquire the fdp->fd_lock interlock to avoid races with
1319 * file creation/destruction from other threads.
1320 */
1321 mutex_spin_exit(&kq->kq_lock);
1322 mutex_enter(&fdp->fd_lock);
1323 mutex_spin_enter(&kq->kq_lock);
1324
1325 while (count != 0) {
1326 kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */
1327 while ((kn->kn_status & KN_MARKER) != 0) {
1328 if (kn == marker) {
1329 /* it's our marker, stop */
1330 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1331 if (count < maxevents || (tsp != NULL &&
1332 (timeout = gettimeleft(&ats,
1333 &sleepts)) <= 0))
1334 goto done;
1335 mutex_exit(&fdp->fd_lock);
1336 goto retry;
1337 }
1338 /* someone else's marker. */
1339 kn = TAILQ_NEXT(kn, kn_tqe);
1340 }
1341 kq_check(kq);
1342 kq->kq_count--;
1343 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1344 kn->kn_status &= ~KN_QUEUED;
1345 kn->kn_status |= KN_BUSY;
1346 kq_check(kq);
1347 if (kn->kn_status & KN_DISABLED) {
1348 kn->kn_status &= ~KN_BUSY;
1349 /* don't want disabled events */
1350 continue;
1351 }
1352 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1353 mutex_spin_exit(&kq->kq_lock);
1354 KASSERT(kn->kn_fop != NULL);
1355 KASSERT(kn->kn_fop->f_event != NULL);
1356 KERNEL_LOCK(1, NULL); /* XXXSMP */
1357 KASSERT(mutex_owned(&fdp->fd_lock));
1358 rv = (*kn->kn_fop->f_event)(kn, 0);
1359 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1360 mutex_spin_enter(&kq->kq_lock);
1361 /* Re-poll if note was re-enqueued. */
1362 if ((kn->kn_status & KN_QUEUED) != 0) {
1363 kn->kn_status &= ~KN_BUSY;
1364 continue;
1365 }
1366 if (rv == 0) {
1367 /*
1368 * non-ONESHOT event that hasn't
1369 * triggered again, so de-queue.
1370 */
1371 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1372 continue;
1373 }
1374 }
1375 /* XXXAD should be got from f_event if !oneshot. */
1376 *kevp++ = kn->kn_kevent;
1377 nkev++;
1378 if (kn->kn_flags & EV_ONESHOT) {
1379 /* delete ONESHOT events after retrieval */
1380 kn->kn_status &= ~KN_BUSY;
1381 mutex_spin_exit(&kq->kq_lock);
1382 knote_detach(kn, fdp, true);
1383 mutex_enter(&fdp->fd_lock);
1384 mutex_spin_enter(&kq->kq_lock);
1385 } else if (kn->kn_flags & EV_CLEAR) {
1386 /* clear state after retrieval */
1387 kn->kn_data = 0;
1388 kn->kn_fflags = 0;
1389 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
1390 } else if (kn->kn_flags & EV_DISPATCH) {
1391 kn->kn_status |= KN_DISABLED;
1392 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
1393 } else {
1394 /* add event back on list */
1395 kq_check(kq);
1396 kn->kn_status |= KN_QUEUED;
1397 kn->kn_status &= ~KN_BUSY;
1398 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1399 kq->kq_count++;
1400 kq_check(kq);
1401 }
1402 if (nkev == kevcnt) {
1403 /* do copyouts in kevcnt chunks */
1404 mutex_spin_exit(&kq->kq_lock);
1405 mutex_exit(&fdp->fd_lock);
1406 error = (*keops->keo_put_events)
1407 (keops->keo_private,
1408 kevbuf, ulistp, nevents, nkev);
1409 mutex_enter(&fdp->fd_lock);
1410 mutex_spin_enter(&kq->kq_lock);
1411 nevents += nkev;
1412 nkev = 0;
1413 kevp = kevbuf;
1414 }
1415 count--;
1416 if (error != 0 || count == 0) {
1417 /* remove marker */
1418 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1419 break;
1420 }
1421 }
1422 done:
1423 mutex_spin_exit(&kq->kq_lock);
1424 mutex_exit(&fdp->fd_lock);
1425 }
1426 if (nkev != 0) {
1427 /* copyout remaining events */
1428 error = (*keops->keo_put_events)(keops->keo_private,
1429 kevbuf, ulistp, nevents, nkev);
1430 }
1431 *retval = maxevents - count;
1432
1433 return error;
1434 }
1435
1436 /*
1437 * fileops ioctl method for a kqueue descriptor.
1438 *
1439 * Two ioctls are currently supported. They both use struct kfilter_mapping:
1440 * KFILTER_BYFILTER find name for filter, and return result in
1441 * name, which is of size len.
1442 * KFILTER_BYNAME find filter for name. len is ignored.
1443 */
1444 /*ARGSUSED*/
1445 static int
1446 kqueue_ioctl(file_t *fp, u_long com, void *data)
1447 {
1448 struct kfilter_mapping *km;
1449 const struct kfilter *kfilter;
1450 char *name;
1451 int error;
1452
1453 km = data;
1454 error = 0;
1455 name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1456
1457 switch (com) {
1458 case KFILTER_BYFILTER: /* convert filter -> name */
1459 rw_enter(&kqueue_filter_lock, RW_READER);
1460 kfilter = kfilter_byfilter(km->filter);
1461 if (kfilter != NULL) {
1462 strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1463 rw_exit(&kqueue_filter_lock);
1464 error = copyoutstr(name, km->name, km->len, NULL);
1465 } else {
1466 rw_exit(&kqueue_filter_lock);
1467 error = ENOENT;
1468 }
1469 break;
1470
1471 case KFILTER_BYNAME: /* convert name -> filter */
1472 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1473 if (error) {
1474 break;
1475 }
1476 rw_enter(&kqueue_filter_lock, RW_READER);
1477 kfilter = kfilter_byname(name);
1478 if (kfilter != NULL)
1479 km->filter = kfilter->filter;
1480 else
1481 error = ENOENT;
1482 rw_exit(&kqueue_filter_lock);
1483 break;
1484
1485 default:
1486 error = ENOTTY;
1487 break;
1488
1489 }
1490 kmem_free(name, KFILTER_MAXNAME);
1491 return (error);
1492 }
1493
1494 /*
1495 * fileops fcntl method for a kqueue descriptor.
1496 */
1497 static int
1498 kqueue_fcntl(file_t *fp, u_int com, void *data)
1499 {
1500
1501 return (ENOTTY);
1502 }
1503
1504 /*
1505 * fileops poll method for a kqueue descriptor.
1506 * Determine if kqueue has events pending.
1507 */
1508 static int
1509 kqueue_poll(file_t *fp, int events)
1510 {
1511 struct kqueue *kq;
1512 int revents;
1513
1514 kq = fp->f_kqueue;
1515
1516 revents = 0;
1517 if (events & (POLLIN | POLLRDNORM)) {
1518 mutex_spin_enter(&kq->kq_lock);
1519 if (kq->kq_count != 0) {
1520 revents |= events & (POLLIN | POLLRDNORM);
1521 } else {
1522 selrecord(curlwp, &kq->kq_sel);
1523 }
1524 kq_check(kq);
1525 mutex_spin_exit(&kq->kq_lock);
1526 }
1527
1528 return revents;
1529 }
1530
1531 /*
1532 * fileops stat method for a kqueue descriptor.
1533 * Returns dummy info, with st_size being number of events pending.
1534 */
1535 static int
1536 kqueue_stat(file_t *fp, struct stat *st)
1537 {
1538 struct kqueue *kq;
1539
1540 kq = fp->f_kqueue;
1541
1542 memset(st, 0, sizeof(*st));
1543 st->st_size = kq->kq_count;
1544 st->st_blksize = sizeof(struct kevent);
1545 st->st_mode = S_IFIFO;
1546
1547 return 0;
1548 }
1549
1550 static void
1551 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
1552 {
1553 struct knote *kn;
1554 filedesc_t *fdp;
1555
1556 fdp = kq->kq_fdp;
1557
1558 KASSERT(mutex_owned(&fdp->fd_lock));
1559
1560 for (kn = SLIST_FIRST(list); kn != NULL;) {
1561 if (kq != kn->kn_kq) {
1562 kn = SLIST_NEXT(kn, kn_link);
1563 continue;
1564 }
1565 knote_detach(kn, fdp, true);
1566 mutex_enter(&fdp->fd_lock);
1567 kn = SLIST_FIRST(list);
1568 }
1569 }
1570
1571
1572 /*
1573 * fileops close method for a kqueue descriptor.
1574 */
1575 static int
1576 kqueue_close(file_t *fp)
1577 {
1578 struct kqueue *kq;
1579 filedesc_t *fdp;
1580 fdfile_t *ff;
1581 int i;
1582
1583 kq = fp->f_kqueue;
1584 fp->f_kqueue = NULL;
1585 fp->f_type = 0;
1586 fdp = curlwp->l_fd;
1587
1588 mutex_enter(&fdp->fd_lock);
1589 for (i = 0; i <= fdp->fd_lastkqfile; i++) {
1590 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1591 continue;
1592 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1593 }
1594 if (fdp->fd_knhashmask != 0) {
1595 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
1596 kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
1597 }
1598 }
1599 mutex_exit(&fdp->fd_lock);
1600
1601 KASSERT(kq->kq_count == 0);
1602 mutex_destroy(&kq->kq_lock);
1603 cv_destroy(&kq->kq_cv);
1604 seldestroy(&kq->kq_sel);
1605 kmem_free(kq, sizeof(*kq));
1606
1607 return (0);
1608 }
1609
1610 /*
1611 * struct fileops kqfilter method for a kqueue descriptor.
1612 * Event triggered when monitored kqueue changes.
1613 */
1614 static int
1615 kqueue_kqfilter(file_t *fp, struct knote *kn)
1616 {
1617 struct kqueue *kq;
1618
1619 kq = ((file_t *)kn->kn_obj)->f_kqueue;
1620
1621 KASSERT(fp == kn->kn_obj);
1622
1623 if (kn->kn_filter != EVFILT_READ)
1624 return 1;
1625
1626 kn->kn_fop = &kqread_filtops;
1627 mutex_enter(&kq->kq_lock);
1628 SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
1629 mutex_exit(&kq->kq_lock);
1630
1631 return 0;
1632 }
1633
1634
1635 /*
1636 * Walk down a list of knotes, activating them if their event has
1637 * triggered. The caller's object lock (e.g. device driver lock)
1638 * must be held.
1639 */
1640 void
1641 knote(struct klist *list, long hint)
1642 {
1643 struct knote *kn, *tmpkn;
1644
1645 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1646 KASSERT(kn->kn_fop != NULL);
1647 KASSERT(kn->kn_fop->f_event != NULL);
1648 if ((*kn->kn_fop->f_event)(kn, hint))
1649 knote_activate(kn);
1650 }
1651 }
1652
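/*
 * Example (illustrative only): how a driver typically feeds knote().
 * The driver's kqfilter routine puts read-side knotes on its
 * struct selinfo klist (and stores the softc in kn_hook); the
 * interrupt path then calls selnotify() with the object lock held,
 * which runs knote() -- and hence f_event(NOTE_SUBMIT) -- on every
 * knote on that list.  The "example_softc" structure is hypothetical.
 */
#if 0
struct example_softc {
	kmutex_t	sc_lock;	/* object lock for the klist */
	struct selinfo	sc_rsel;	/* readers' select/kqueue state */
	size_t		sc_avail;	/* bytes ready to be read */
};

static int
example_filt_read(struct knote *kn, long hint)
{
	struct example_softc *sc = kn->kn_hook;

	/* NOTE_SUBMIT means the caller already holds sc_lock */
	if (hint != NOTE_SUBMIT)
		mutex_enter(&sc->sc_lock);
	kn->kn_data = sc->sc_avail;
	if (hint != NOTE_SUBMIT)
		mutex_exit(&sc->sc_lock);
	return kn->kn_data > 0;
}

static void
example_rxintr(struct example_softc *sc, size_t nbytes)
{

	mutex_enter(&sc->sc_lock);
	sc->sc_avail += nbytes;
	/* activates every knote on sc_rsel's klist via knote() */
	selnotify(&sc->sc_rsel, 0, NOTE_SUBMIT);
	mutex_exit(&sc->sc_lock);
}
#endif
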
1653 /*
1654 * Remove all knotes referencing a specified fd
1655 */
1656 void
1657 knote_fdclose(int fd)
1658 {
1659 struct klist *list;
1660 struct knote *kn;
1661 filedesc_t *fdp;
1662
1663 fdp = curlwp->l_fd;
1664 list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1665 mutex_enter(&fdp->fd_lock);
1666 while ((kn = SLIST_FIRST(list)) != NULL) {
1667 knote_detach(kn, fdp, true);
1668 mutex_enter(&fdp->fd_lock);
1669 }
1670 mutex_exit(&fdp->fd_lock);
1671 }
1672
1673 /*
1674 * Drop knote. Called with fdp->fd_lock held, which is dropped before
1675 * returning.
1676 */
1677 static void
1678 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
1679 {
1680 struct klist *list;
1681 struct kqueue *kq;
1682
1683 kq = kn->kn_kq;
1684
1685 KASSERT((kn->kn_status & KN_MARKER) == 0);
1686 KASSERT(mutex_owned(&fdp->fd_lock));
1687
1688 KASSERT(kn->kn_fop != NULL);
1689 /* Remove from monitored object. */
1690 if (dofop) {
1691 KASSERT(kn->kn_fop->f_detach != NULL);
1692 KERNEL_LOCK(1, NULL); /* XXXSMP */
1693 (*kn->kn_fop->f_detach)(kn);
1694 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1695 }
1696
1697 /* Remove from descriptor table. */
1698 if (kn->kn_fop->f_isfd)
1699 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1700 else
1701 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1702
1703 SLIST_REMOVE(list, kn, knote, kn_link);
1704
1705 /* Remove from kqueue. */
1706 again:
1707 mutex_spin_enter(&kq->kq_lock);
1708 if ((kn->kn_status & KN_QUEUED) != 0) {
1709 kq_check(kq);
1710 kq->kq_count--;
1711 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1712 kn->kn_status &= ~KN_QUEUED;
1713 kq_check(kq);
1714 } else if (kn->kn_status & KN_BUSY) {
1715 mutex_spin_exit(&kq->kq_lock);
1716 goto again;
1717 }
1718 mutex_spin_exit(&kq->kq_lock);
1719
1720 mutex_exit(&fdp->fd_lock);
1721 if (kn->kn_fop->f_isfd)
1722 fd_putfile(kn->kn_id);
1723 atomic_dec_uint(&kn->kn_kfilter->refcnt);
1724 kmem_free(kn, sizeof(*kn));
1725 }
1726
1727 /*
1728 * Re-enable a knote and queue it on its kqueue if it is already active.
1729 */
1730 static void
1731 knote_enqueue(struct knote *kn)
1732 {
1733 struct kqueue *kq;
1734
1735 KASSERT((kn->kn_status & KN_MARKER) == 0);
1736
1737 kq = kn->kn_kq;
1738
1739 mutex_spin_enter(&kq->kq_lock);
1740 if ((kn->kn_status & KN_DISABLED) != 0) {
1741 kn->kn_status &= ~KN_DISABLED;
1742 }
1743 if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
1744 kq_check(kq);
1745 kn->kn_status |= KN_QUEUED;
1746 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1747 kq->kq_count++;
1748 kq_check(kq);
1749 cv_broadcast(&kq->kq_cv);
1750 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1751 }
1752 mutex_spin_exit(&kq->kq_lock);
1753 }
1754 /*
1755 * Mark a knote as active and queue it on its kqueue, unless it is disabled or already queued.
1756 */
1757 static void
1758 knote_activate(struct knote *kn)
1759 {
1760 struct kqueue *kq;
1761
1762 KASSERT((kn->kn_status & KN_MARKER) == 0);
1763
1764 kq = kn->kn_kq;
1765
1766 mutex_spin_enter(&kq->kq_lock);
1767 kn->kn_status |= KN_ACTIVE;
1768 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
1769 kq_check(kq);
1770 kn->kn_status |= KN_QUEUED;
1771 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1772 kq->kq_count++;
1773 kq_check(kq);
1774 cv_broadcast(&kq->kq_cv);
1775 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1776 }
1777 mutex_spin_exit(&kq->kq_lock);
1778 }
1779