   1 /*	$NetBSD: kern_event.c,v 1.104 2018/11/13 06:58:14 maxv Exp $	*/
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon (at) FreeBSD.org>
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
58 */
59
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.104 2018/11/13 06:58:14 maxv Exp $");
62
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/wait.h>
67 #include <sys/proc.h>
68 #include <sys/file.h>
69 #include <sys/select.h>
70 #include <sys/queue.h>
71 #include <sys/event.h>
72 #include <sys/eventvar.h>
73 #include <sys/poll.h>
74 #include <sys/kmem.h>
75 #include <sys/stat.h>
76 #include <sys/filedesc.h>
77 #include <sys/syscallargs.h>
78 #include <sys/kauth.h>
79 #include <sys/conf.h>
80 #include <sys/atomic.h>
81
82 static int kqueue_scan(file_t *, size_t, struct kevent *,
83 const struct timespec *, register_t *,
84 const struct kevent_ops *, struct kevent *,
85 size_t);
86 static int kqueue_ioctl(file_t *, u_long, void *);
87 static int kqueue_fcntl(file_t *, u_int, void *);
88 static int kqueue_poll(file_t *, int);
89 static int kqueue_kqfilter(file_t *, struct knote *);
90 static int kqueue_stat(file_t *, struct stat *);
91 static int kqueue_close(file_t *);
92 static int kqueue_register(struct kqueue *, struct kevent *);
93 static void kqueue_doclose(struct kqueue *, struct klist *, int);
94
95 static void knote_detach(struct knote *, filedesc_t *fdp, bool);
96 static void knote_enqueue(struct knote *);
97 static void knote_activate(struct knote *);
98
99 static void filt_kqdetach(struct knote *);
100 static int filt_kqueue(struct knote *, long hint);
101 static int filt_procattach(struct knote *);
102 static void filt_procdetach(struct knote *);
103 static int filt_proc(struct knote *, long hint);
104 static int filt_fileattach(struct knote *);
105 static void filt_timerexpire(void *x);
106 static int filt_timerattach(struct knote *);
107 static void filt_timerdetach(struct knote *);
108 static int filt_timer(struct knote *, long hint);
109 static int filt_fsattach(struct knote *kn);
110 static void filt_fsdetach(struct knote *kn);
111 static int filt_fs(struct knote *kn, long hint);
112
113 static const struct fileops kqueueops = {
114 .fo_name = "kqueue",
115 .fo_read = (void *)enxio,
116 .fo_write = (void *)enxio,
117 .fo_ioctl = kqueue_ioctl,
118 .fo_fcntl = kqueue_fcntl,
119 .fo_poll = kqueue_poll,
120 .fo_stat = kqueue_stat,
121 .fo_close = kqueue_close,
122 .fo_kqfilter = kqueue_kqfilter,
123 .fo_restart = fnullop_restart,
124 };
125
126 static const struct filterops kqread_filtops = {
127 .f_isfd = 1,
128 .f_attach = NULL,
129 .f_detach = filt_kqdetach,
130 .f_event = filt_kqueue,
131 };
132
133 static const struct filterops proc_filtops = {
134 .f_isfd = 0,
135 .f_attach = filt_procattach,
136 .f_detach = filt_procdetach,
137 .f_event = filt_proc,
138 };
139
140 static const struct filterops file_filtops = {
141 .f_isfd = 1,
142 .f_attach = filt_fileattach,
143 .f_detach = NULL,
144 .f_event = NULL,
145 };
146
147 static const struct filterops timer_filtops = {
148 .f_isfd = 0,
149 .f_attach = filt_timerattach,
150 .f_detach = filt_timerdetach,
151 .f_event = filt_timer,
152 };
153
154 static const struct filterops fs_filtops = {
155 .f_isfd = 0,
156 .f_attach = filt_fsattach,
157 .f_detach = filt_fsdetach,
158 .f_event = filt_fs,
159 };
160
161 static u_int kq_ncallouts = 0;
162 static int kq_calloutmax = (4 * 1024);
163
164 #define KN_HASHSIZE 64 /* XXX should be tunable */
165 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
166
167 extern const struct filterops sig_filtops;
168
169 /*
 170  * Table for all system-defined filters.
171 * These should be listed in the numeric order of the EVFILT_* defines.
172 * If filtops is NULL, the filter isn't implemented in NetBSD.
173 * End of list is when name is NULL.
174 *
175 * Note that 'refcnt' is meaningless for built-in filters.
176 */
177 struct kfilter {
178 const char *name; /* name of filter */
179 uint32_t filter; /* id of filter */
180 unsigned refcnt; /* reference count */
181 const struct filterops *filtops;/* operations for filter */
182 size_t namelen; /* length of name string */
183 };
184
185 /* System defined filters */
186 static struct kfilter sys_kfilters[] = {
187 { "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
188 { "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
189 { "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
190 { "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
191 { "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
192 { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
193 { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
194 { "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
195 { NULL, 0, 0, NULL, 0 },
196 };
197
198 /* User defined kfilters */
199 static struct kfilter *user_kfilters; /* array */
200 static int user_kfilterc; /* current offset */
201 static int user_kfiltermaxc; /* max size so far */
202 static size_t user_kfiltersz; /* size of allocated memory */
203
204 /*
205 * Global Locks.
206 *
207 * Lock order:
208 *
209 * kqueue_filter_lock
210 * -> kn_kq->kq_fdp->fd_lock
211 * -> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
212 * -> kn_kq->kq_lock
213 *
214 * Locking rules:
215 *
216 * f_attach: fdp->fd_lock, KERNEL_LOCK
217 * f_detach: fdp->fd_lock, KERNEL_LOCK
218 * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
219 * f_event via knote: whatever caller guarantees
220 * Typically, f_event(NOTE_SUBMIT) via knote: object lock
221 * f_event(!NOTE_SUBMIT) via knote: nothing,
222 * acquires/releases object lock inside.
223 */
224 static krwlock_t kqueue_filter_lock; /* lock on filter lists */
225 static kmutex_t kqueue_misc_lock; /* miscellaneous */
226
227 static kauth_listener_t kqueue_listener;
228
229 static int
230 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
231 void *arg0, void *arg1, void *arg2, void *arg3)
232 {
233 struct proc *p;
234 int result;
235
236 result = KAUTH_RESULT_DEFER;
237 p = arg0;
238
239 if (action != KAUTH_PROCESS_KEVENT_FILTER)
240 return result;
241
242 if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
243 ISSET(p->p_flag, PK_SUGID)))
244 return result;
245
246 result = KAUTH_RESULT_ALLOW;
247
248 return result;
249 }
250
251 /*
252 * Initialize the kqueue subsystem.
253 */
254 void
255 kqueue_init(void)
256 {
257
258 rw_init(&kqueue_filter_lock);
259 mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
260
261 kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
262 kqueue_listener_cb, NULL);
263 }
264
265 /*
266 * Find kfilter entry by name, or NULL if not found.
267 */
268 static struct kfilter *
269 kfilter_byname_sys(const char *name)
270 {
271 int i;
272
273 KASSERT(rw_lock_held(&kqueue_filter_lock));
274
275 for (i = 0; sys_kfilters[i].name != NULL; i++) {
276 if (strcmp(name, sys_kfilters[i].name) == 0)
277 return &sys_kfilters[i];
278 }
279 return NULL;
280 }
281
282 static struct kfilter *
283 kfilter_byname_user(const char *name)
284 {
285 int i;
286
287 KASSERT(rw_lock_held(&kqueue_filter_lock));
288
289 /* user filter slots have a NULL name if previously deregistered */
290 for (i = 0; i < user_kfilterc ; i++) {
291 if (user_kfilters[i].name != NULL &&
292 strcmp(name, user_kfilters[i].name) == 0)
293 return &user_kfilters[i];
294 }
295 return NULL;
296 }
297
298 static struct kfilter *
299 kfilter_byname(const char *name)
300 {
301 struct kfilter *kfilter;
302
303 KASSERT(rw_lock_held(&kqueue_filter_lock));
304
305 if ((kfilter = kfilter_byname_sys(name)) != NULL)
306 return kfilter;
307
308 return kfilter_byname_user(name);
309 }
310
311 /*
312 * Find kfilter entry by filter id, or NULL if not found.
313 * Assumes entries are indexed in filter id order, for speed.
314 */
315 static struct kfilter *
316 kfilter_byfilter(uint32_t filter)
317 {
318 struct kfilter *kfilter;
319
320 KASSERT(rw_lock_held(&kqueue_filter_lock));
321
322 if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
323 kfilter = &sys_kfilters[filter];
324 else if (user_kfilters != NULL &&
325 filter < EVFILT_SYSCOUNT + user_kfilterc)
326 /* it's a user filter */
327 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
328 else
329 return (NULL); /* out of range */
330 KASSERT(kfilter->filter == filter); /* sanity check! */
331 return (kfilter);
332 }
333
334 /*
335 * Register a new kfilter. Stores the entry in user_kfilters.
336 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
337 * If retfilter != NULL, the new filterid is returned in it.
338 */
339 int
340 kfilter_register(const char *name, const struct filterops *filtops,
341 int *retfilter)
342 {
343 struct kfilter *kfilter;
344 size_t len;
345 int i;
346
347 if (name == NULL || name[0] == '\0' || filtops == NULL)
348 return (EINVAL); /* invalid args */
349
350 rw_enter(&kqueue_filter_lock, RW_WRITER);
351 if (kfilter_byname(name) != NULL) {
352 rw_exit(&kqueue_filter_lock);
353 return (EEXIST); /* already exists */
354 }
355 if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
356 rw_exit(&kqueue_filter_lock);
357 return (EINVAL); /* too many */
358 }
359
360 for (i = 0; i < user_kfilterc; i++) {
361 kfilter = &user_kfilters[i];
362 if (kfilter->name == NULL) {
363 /* Previously deregistered slot. Reuse. */
364 goto reuse;
365 }
366 }
367
368 /* check if need to grow user_kfilters */
369 if (user_kfilterc + 1 > user_kfiltermaxc) {
370 /* Grow in KFILTER_EXTENT chunks. */
371 user_kfiltermaxc += KFILTER_EXTENT;
372 len = user_kfiltermaxc * sizeof(*kfilter);
373 kfilter = kmem_alloc(len, KM_SLEEP);
374 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
375 if (user_kfilters != NULL) {
376 memcpy(kfilter, user_kfilters, user_kfiltersz);
377 kmem_free(user_kfilters, user_kfiltersz);
378 }
379 user_kfiltersz = len;
380 user_kfilters = kfilter;
381 }
382 /* Adding new slot */
383 kfilter = &user_kfilters[user_kfilterc++];
384 reuse:
385 kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
386
387 kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
388
389 kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
390 memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
391
392 if (retfilter != NULL)
393 *retfilter = kfilter->filter;
394 rw_exit(&kqueue_filter_lock);
395
396 return (0);
397 }
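
/*
 * Example (illustrative sketch): how a module might register its own filter
 * with kfilter_register() and later remove it with kfilter_unregister().
 * The mydev_* names below are placeholders for code the module supplies.
 *
 *	static int	filt_mydev_attach(struct knote *);
 *	static void	filt_mydev_detach(struct knote *);
 *	static int	filt_mydev_event(struct knote *, long);
 *
 *	static const struct filterops mydev_filtops = {
 *		.f_isfd = 0,
 *		.f_attach = filt_mydev_attach,
 *		.f_detach = filt_mydev_detach,
 *		.f_event = filt_mydev_event,
 *	};
 *	static int mydev_filter;
 *
 *	At attach time:
 *		error = kfilter_register("EVFILT_MYDEV", &mydev_filtops,
 *		    &mydev_filter);
 *
 *	At detach time (fails with EBUSY while knotes still reference it):
 *		error = kfilter_unregister("EVFILT_MYDEV");
 */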
398
399 /*
400 * Unregister a kfilter previously registered with kfilter_register.
401 * This retains the filter id, but clears the name and frees filtops (filter
 402  * operations), so that the filter id isn't reused during this boot.
403 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
404 */
405 int
406 kfilter_unregister(const char *name)
407 {
408 struct kfilter *kfilter;
409
410 if (name == NULL || name[0] == '\0')
411 return (EINVAL); /* invalid name */
412
413 rw_enter(&kqueue_filter_lock, RW_WRITER);
414 if (kfilter_byname_sys(name) != NULL) {
415 rw_exit(&kqueue_filter_lock);
416 return (EINVAL); /* can't detach system filters */
417 }
418
419 kfilter = kfilter_byname_user(name);
420 if (kfilter == NULL) {
421 rw_exit(&kqueue_filter_lock);
422 return (ENOENT);
423 }
424 if (kfilter->refcnt != 0) {
425 rw_exit(&kqueue_filter_lock);
426 return (EBUSY);
427 }
428
 429 	/* Cast away const (but we know it's safe). */
430 kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
431 kfilter->name = NULL; /* mark as `not implemented' */
432
433 if (kfilter->filtops != NULL) {
 434 		/* Cast away const (but we know it's safe). */
435 kmem_free(__UNCONST(kfilter->filtops),
436 sizeof(*kfilter->filtops));
437 kfilter->filtops = NULL; /* mark as `not implemented' */
438 }
439 rw_exit(&kqueue_filter_lock);
440
441 return (0);
442 }
443
444
445 /*
446 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
447 * descriptors. Calls fileops kqfilter method for given file descriptor.
448 */
449 static int
450 filt_fileattach(struct knote *kn)
451 {
452 file_t *fp;
453
454 fp = kn->kn_obj;
455
456 return (*fp->f_ops->fo_kqfilter)(fp, kn);
457 }
458
459 /*
460 * Filter detach method for EVFILT_READ on kqueue descriptor.
461 */
462 static void
463 filt_kqdetach(struct knote *kn)
464 {
465 struct kqueue *kq;
466
467 kq = ((file_t *)kn->kn_obj)->f_kqueue;
468
469 mutex_spin_enter(&kq->kq_lock);
470 SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
471 mutex_spin_exit(&kq->kq_lock);
472 }
473
474 /*
475 * Filter event method for EVFILT_READ on kqueue descriptor.
476 */
477 /*ARGSUSED*/
478 static int
479 filt_kqueue(struct knote *kn, long hint)
480 {
481 struct kqueue *kq;
482 int rv;
483
484 kq = ((file_t *)kn->kn_obj)->f_kqueue;
485
486 if (hint != NOTE_SUBMIT)
487 mutex_spin_enter(&kq->kq_lock);
488 kn->kn_data = kq->kq_count;
489 rv = (kn->kn_data > 0);
490 if (hint != NOTE_SUBMIT)
491 mutex_spin_exit(&kq->kq_lock);
492
493 return rv;
494 }
495
496 /*
497 * Filter attach method for EVFILT_PROC.
498 */
499 static int
500 filt_procattach(struct knote *kn)
501 {
502 struct proc *p;
503 struct lwp *curl;
504
505 curl = curlwp;
506
507 mutex_enter(proc_lock);
508 if (kn->kn_flags & EV_FLAG1) {
509 /*
510 * NOTE_TRACK attaches to the child process too early
511 * for proc_find, so do a raw look up and check the state
512 * explicitly.
513 */
514 p = proc_find_raw(kn->kn_id);
515 if (p != NULL && p->p_stat != SIDL)
516 p = NULL;
517 } else {
518 p = proc_find(kn->kn_id);
519 }
520
521 if (p == NULL) {
522 mutex_exit(proc_lock);
523 return ESRCH;
524 }
525
526 /*
527 * Fail if it's not owned by you, or the last exec gave us
528 * setuid/setgid privs (unless you're root).
529 */
530 mutex_enter(p->p_lock);
531 mutex_exit(proc_lock);
532 if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
533 p, NULL, NULL, NULL) != 0) {
534 mutex_exit(p->p_lock);
535 return EACCES;
536 }
537
538 kn->kn_obj = p;
539 kn->kn_flags |= EV_CLEAR; /* automatically set */
540
541 /*
542 * internal flag indicating registration done by kernel
543 */
544 if (kn->kn_flags & EV_FLAG1) {
545 kn->kn_data = kn->kn_sdata; /* ppid */
546 kn->kn_fflags = NOTE_CHILD;
547 kn->kn_flags &= ~EV_FLAG1;
548 }
549 SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
550 mutex_exit(p->p_lock);
551
552 return 0;
553 }
554
555 /*
556 * Filter detach method for EVFILT_PROC.
557 *
 558  * The knote may be attached to a process other than the kqueue's owner,
 559  * and that process may exit, leaving nothing for the knote to be attached
 560  * to.  When the watched process exits, the knote is marked DETACHED and
 561  * flagged ONESHOT, so it is deleted when read out.  This routine is also
 562  * called as part of that deletion, so it checks KN_DETACHED to avoid
 563  * detaching from a process that no longer exists.
564 */
565 static void
566 filt_procdetach(struct knote *kn)
567 {
568 struct proc *p;
569
570 if (kn->kn_status & KN_DETACHED)
571 return;
572
573 p = kn->kn_obj;
574
575 mutex_enter(p->p_lock);
576 SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
577 mutex_exit(p->p_lock);
578 }
579
580 /*
581 * Filter event method for EVFILT_PROC.
582 */
583 static int
584 filt_proc(struct knote *kn, long hint)
585 {
586 u_int event, fflag;
587 struct kevent kev;
588 struct kqueue *kq;
589 int error;
590
591 event = (u_int)hint & NOTE_PCTRLMASK;
592 kq = kn->kn_kq;
593 fflag = 0;
594
595 /* If the user is interested in this event, record it. */
596 if (kn->kn_sfflags & event)
597 fflag |= event;
598
599 if (event == NOTE_EXIT) {
600 struct proc *p = kn->kn_obj;
601
602 if (p != NULL)
603 kn->kn_data = P_WAITSTATUS(p);
604 /*
605 * Process is gone, so flag the event as finished.
606 *
 607 		 * Detach the knote from the watched process and mark
 608 		 * it as such.  We can't leave this to kqueue_scan(),
 609 		 * since the process might not exist by then.  We also
 610 		 * have to do it now because the KNOTE() in psignal is
 611 		 * called for zombies as well, and we could end up
 612 		 * reading freed memory if the kevent had already been
 613 		 * picked up and the knote g/c'ed.
614 */
615 filt_procdetach(kn);
616
617 mutex_spin_enter(&kq->kq_lock);
618 kn->kn_status |= KN_DETACHED;
 619 		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
620 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
621 kn->kn_fflags |= fflag;
622 mutex_spin_exit(&kq->kq_lock);
623
624 return 1;
625 }
626
627 mutex_spin_enter(&kq->kq_lock);
628 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
629 /*
630 * Process forked, and user wants to track the new process,
631 * so attach a new knote to it, and immediately report an
632 * event with the parent's pid. Register knote with new
633 * process.
634 */
635 memset(&kev, 0, sizeof(kev));
636 kev.ident = hint & NOTE_PDATAMASK; /* pid */
637 kev.filter = kn->kn_filter;
638 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
639 kev.fflags = kn->kn_sfflags;
640 kev.data = kn->kn_id; /* parent */
641 kev.udata = kn->kn_kevent.udata; /* preserve udata */
642 mutex_spin_exit(&kq->kq_lock);
643 error = kqueue_register(kq, &kev);
644 mutex_spin_enter(&kq->kq_lock);
645 if (error != 0)
646 kn->kn_fflags |= NOTE_TRACKERR;
647 }
648 kn->kn_fflags |= fflag;
649 fflag = kn->kn_fflags;
650 mutex_spin_exit(&kq->kq_lock);
651
652 return fflag != 0;
653 }
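
/*
 * Example (illustrative userland sketch): watching a child with
 * EVFILT_PROC/NOTE_EXIT.  On NOTE_EXIT the filter above returns the wait
 * status in the event's data field.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/wait.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent ev;
 *		pid_t pid;
 *		int kq, status;
 *
 *		if ((pid = fork()) == -1)
 *			err(1, "fork");
 *		if (pid == 0) {
 *			sleep(1);
 *			_exit(7);
 *		}
 *		if ((kq = kqueue()) == -1)
 *			err(1, "kqueue");
 *		EV_SET(&ev, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, 0);
 *		if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *			err(1, "kevent: register");
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
 *		    (ev.fflags & NOTE_EXIT)) {
 *			status = (int)ev.data;
 *			printf("child %lu exited, status %d\n",
 *			    (unsigned long)ev.ident, WEXITSTATUS(status));
 *		}
 *		return 0;
 *	}
 */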
654
655 static void
656 filt_timerexpire(void *knx)
657 {
658 struct knote *kn = knx;
659 int tticks;
660
661 mutex_enter(&kqueue_misc_lock);
662 kn->kn_data++;
663 knote_activate(kn);
664 if ((kn->kn_flags & EV_ONESHOT) == 0) {
665 tticks = mstohz(kn->kn_sdata);
666 if (tticks <= 0)
667 tticks = 1;
668 callout_schedule((callout_t *)kn->kn_hook, tticks);
669 }
670 mutex_exit(&kqueue_misc_lock);
671 }
672
673 /*
674 * data contains amount of time to sleep, in milliseconds
675 */
676 static int
677 filt_timerattach(struct knote *kn)
678 {
679 callout_t *calloutp;
680 struct kqueue *kq;
681 int tticks;
682
683 tticks = mstohz(kn->kn_sdata);
684
685 /* if the supplied value is under our resolution, use 1 tick */
686 if (tticks == 0) {
687 if (kn->kn_sdata == 0)
688 return EINVAL;
689 tticks = 1;
690 }
691
692 if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
693 (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
694 atomic_dec_uint(&kq_ncallouts);
695 return ENOMEM;
696 }
697 callout_init(calloutp, CALLOUT_MPSAFE);
698
699 kq = kn->kn_kq;
700 mutex_spin_enter(&kq->kq_lock);
701 kn->kn_flags |= EV_CLEAR; /* automatically set */
702 kn->kn_hook = calloutp;
703 mutex_spin_exit(&kq->kq_lock);
704
705 callout_reset(calloutp, tticks, filt_timerexpire, kn);
706
707 return (0);
708 }
709
710 static void
711 filt_timerdetach(struct knote *kn)
712 {
713 callout_t *calloutp;
714 struct kqueue *kq = kn->kn_kq;
715
716 mutex_spin_enter(&kq->kq_lock);
717 /* prevent rescheduling when we expire */
718 kn->kn_flags |= EV_ONESHOT;
719 mutex_spin_exit(&kq->kq_lock);
720
721 calloutp = (callout_t *)kn->kn_hook;
722 callout_halt(calloutp, NULL);
723 callout_destroy(calloutp);
724 kmem_free(calloutp, sizeof(*calloutp));
725 atomic_dec_uint(&kq_ncallouts);
726 }
727
728 static int
729 filt_timer(struct knote *kn, long hint)
730 {
731 int rv;
732
733 mutex_enter(&kqueue_misc_lock);
734 rv = (kn->kn_data != 0);
735 mutex_exit(&kqueue_misc_lock);
736
737 return rv;
738 }
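
/*
 * Example (illustrative userland sketch): a periodic EVFILT_TIMER.  As the
 * comment above notes, the data field of the change is the period in
 * milliseconds; EV_CLEAR is set automatically, so each scan returns the
 * number of expirations since the previous one.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent ev;
 *		int kq;
 *
 *		if ((kq = kqueue()) == -1)
 *			err(1, "kqueue");
 *		EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD, 0, 500, 0);
 *		if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *			err(1, "kevent");
 *		for (;;) {
 *			if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
 *				printf("timer %lu fired %lld time(s)\n",
 *				    (unsigned long)ev.ident,
 *				    (long long)ev.data);
 *		}
 *	}
 */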
739
740 /*
741 * Filter event method for EVFILT_FS.
742 */
743 struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
744
745 static int
746 filt_fsattach(struct knote *kn)
747 {
748
749 mutex_enter(&kqueue_misc_lock);
750 kn->kn_flags |= EV_CLEAR;
751 SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
752 mutex_exit(&kqueue_misc_lock);
753
754 return 0;
755 }
756
757 static void
758 filt_fsdetach(struct knote *kn)
759 {
760
761 mutex_enter(&kqueue_misc_lock);
762 SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
763 mutex_exit(&kqueue_misc_lock);
764 }
765
766 static int
767 filt_fs(struct knote *kn, long hint)
768 {
769 int rv;
770
771 mutex_enter(&kqueue_misc_lock);
772 kn->kn_fflags |= hint;
773 rv = (kn->kn_fflags != 0);
774 mutex_exit(&kqueue_misc_lock);
775
776 return rv;
777 }
778
779 /*
780 * filt_seltrue:
781 *
782 * This filter "event" routine simulates seltrue().
783 */
784 int
785 filt_seltrue(struct knote *kn, long hint)
786 {
787
788 /*
789 * We don't know how much data can be read/written,
790 * but we know that it *can* be. This is about as
791 * good as select/poll does as well.
792 */
793 kn->kn_data = 0;
794 return (1);
795 }
796
797 /*
 798  * This provides a full kqfilter entry for device switch tables, which
 799  * has the same effect as a filter using filt_seltrue() as its event method.
800 */
801 static void
802 filt_seltruedetach(struct knote *kn)
803 {
804 /* Nothing to do */
805 }
806
807 const struct filterops seltrue_filtops = {
808 .f_isfd = 1,
809 .f_attach = NULL,
810 .f_detach = filt_seltruedetach,
811 .f_event = filt_seltrue,
812 };
813
814 int
815 seltrue_kqfilter(dev_t dev, struct knote *kn)
816 {
817 switch (kn->kn_filter) {
818 case EVFILT_READ:
819 case EVFILT_WRITE:
820 kn->kn_fop = &seltrue_filtops;
821 break;
822 default:
823 return (EINVAL);
824 }
825
826 /* Nothing more to do */
827 return (0);
828 }
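
/*
 * Example (illustrative sketch): a character device that is always ready
 * for I/O can point its d_kqfilter entry at seltrue_kqfilter() above, and
 * its d_poll entry at seltrue().  The mydev_* names are placeholders for
 * the driver's own methods, and the cdevsw layout is only sketched here.
 *
 *	const struct cdevsw mydev_cdevsw = {
 *		.d_open = mydev_open,
 *		.d_close = mydev_close,
 *		.d_read = mydev_read,
 *		.d_write = mydev_write,
 *		.d_ioctl = mydev_ioctl,
 *		.d_poll = seltrue,
 *		.d_kqfilter = seltrue_kqfilter,
 *		.d_flag = D_OTHER
 *	};
 */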
829
830 /*
831 * kqueue(2) system call.
832 */
833 static int
834 kqueue1(struct lwp *l, int flags, register_t *retval)
835 {
836 struct kqueue *kq;
837 file_t *fp;
838 int fd, error;
839
840 if ((error = fd_allocfile(&fp, &fd)) != 0)
841 return error;
842 fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
843 fp->f_type = DTYPE_KQUEUE;
844 fp->f_ops = &kqueueops;
845 kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
846 mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
847 cv_init(&kq->kq_cv, "kqueue");
848 selinit(&kq->kq_sel);
849 TAILQ_INIT(&kq->kq_head);
850 fp->f_kqueue = kq;
851 *retval = fd;
852 kq->kq_fdp = curlwp->l_fd;
853 fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
854 fd_affix(curproc, fp, fd);
855 return error;
856 }
857
858 /*
859 * kqueue(2) system call.
860 */
861 int
862 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
863 {
864 return kqueue1(l, 0, retval);
865 }
866
867 int
868 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
869 register_t *retval)
870 {
871 /* {
872 syscallarg(int) flags;
873 } */
874 return kqueue1(l, SCARG(uap, flags), retval);
875 }
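
/*
 * Example (illustrative): the two descriptor-creating entry points as seen
 * from userland.  kqueue1(2), implemented by sys_kqueue1() above, also
 * applies the given open flags to the new descriptor.
 *
 *	#include <sys/event.h>
 *	#include <fcntl.h>
 *
 *	int kq = kqueue();
 *	int kq_cloexec = kqueue1(O_CLOEXEC | O_NONBLOCK);
 */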
876
877 /*
878 * kevent(2) system call.
879 */
880 int
881 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
882 struct kevent *changes, size_t index, int n)
883 {
884
885 return copyin(changelist + index, changes, n * sizeof(*changes));
886 }
887
888 int
889 kevent_put_events(void *ctx, struct kevent *events,
890 struct kevent *eventlist, size_t index, int n)
891 {
892
893 return copyout(events, eventlist + index, n * sizeof(*events));
894 }
895
896 static const struct kevent_ops kevent_native_ops = {
897 .keo_private = NULL,
898 .keo_fetch_timeout = copyin,
899 .keo_fetch_changes = kevent_fetch_changes,
900 .keo_put_events = kevent_put_events,
901 };
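
/*
 * Example (illustrative sketch): struct kevent_ops is what lets kevent1()
 * serve callers other than the native system call.  A hypothetical caller
 * whose changelist and eventlist already live in kernel memory could pass
 * ops that copy with memcpy() instead of copyin()/copyout():
 *
 *	static int
 *	kernel_fetch_timeout(const void *src, void *dst, size_t len)
 *	{
 *		memcpy(dst, src, len);
 *		return 0;
 *	}
 *
 *	static int
 *	kernel_fetch_changes(void *ctx, const struct kevent *changelist,
 *	    struct kevent *changes, size_t index, int n)
 *	{
 *		memcpy(changes, changelist + index, n * sizeof(*changes));
 *		return 0;
 *	}
 *
 *	static int
 *	kernel_put_events(void *ctx, struct kevent *events,
 *	    struct kevent *eventlist, size_t index, int n)
 *	{
 *		memcpy(eventlist + index, events, n * sizeof(*events));
 *		return 0;
 *	}
 *
 *	static const struct kevent_ops kernel_kevent_ops = {
 *		.keo_private = NULL,
 *		.keo_fetch_timeout = kernel_fetch_timeout,
 *		.keo_fetch_changes = kernel_fetch_changes,
 *		.keo_put_events = kernel_put_events,
 *	};
 */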
902
903 int
904 sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
905 register_t *retval)
906 {
907 /* {
908 syscallarg(int) fd;
909 syscallarg(const struct kevent *) changelist;
910 syscallarg(size_t) nchanges;
911 syscallarg(struct kevent *) eventlist;
912 syscallarg(size_t) nevents;
913 syscallarg(const struct timespec *) timeout;
914 } */
915
916 return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
917 SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
918 SCARG(uap, timeout), &kevent_native_ops);
919 }
920
921 int
922 kevent1(register_t *retval, int fd,
923 const struct kevent *changelist, size_t nchanges,
924 struct kevent *eventlist, size_t nevents,
925 const struct timespec *timeout,
926 const struct kevent_ops *keops)
927 {
928 struct kevent *kevp;
929 struct kqueue *kq;
930 struct timespec ts;
931 size_t i, n, ichange;
932 int nerrors, error;
933 struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
934 file_t *fp;
935
936 /* check that we're dealing with a kq */
937 fp = fd_getfile(fd);
938 if (fp == NULL)
939 return (EBADF);
940
941 if (fp->f_type != DTYPE_KQUEUE) {
942 fd_putfile(fd);
943 return (EBADF);
944 }
945
946 if (timeout != NULL) {
947 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
948 if (error)
949 goto done;
950 timeout = &ts;
951 }
952
953 kq = fp->f_kqueue;
954 nerrors = 0;
955 ichange = 0;
956
957 /* traverse list of events to register */
958 while (nchanges > 0) {
959 n = MIN(nchanges, __arraycount(kevbuf));
960 error = (*keops->keo_fetch_changes)(keops->keo_private,
961 changelist, kevbuf, ichange, n);
962 if (error)
963 goto done;
964 for (i = 0; i < n; i++) {
965 kevp = &kevbuf[i];
966 kevp->flags &= ~EV_SYSFLAGS;
967 /* register each knote */
968 error = kqueue_register(kq, kevp);
969 if (!error && !(kevp->flags & EV_RECEIPT))
970 continue;
971 if (nevents == 0)
972 goto done;
973 kevp->flags = EV_ERROR;
974 kevp->data = error;
975 error = (*keops->keo_put_events)
976 (keops->keo_private, kevp,
977 eventlist, nerrors, 1);
978 if (error)
979 goto done;
980 nevents--;
981 nerrors++;
982 }
983 nchanges -= n; /* update the results */
984 ichange += n;
985 }
986 if (nerrors) {
987 *retval = nerrors;
988 error = 0;
989 goto done;
990 }
991
992 /* actually scan through the events */
993 error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
994 kevbuf, __arraycount(kevbuf));
995 done:
996 fd_putfile(fd);
997 return (error);
998 }
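
/*
 * Example (illustrative userland sketch): the usual kevent(2) calling
 * pattern served by the function above -- submit a changelist, then block
 * until an event is returned.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent chg, ev;
 *		int kq, n;
 *
 *		if ((kq = kqueue()) == -1)
 *			err(1, "kqueue");
 *		EV_SET(&chg, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, 0);
 *		if (kevent(kq, &chg, 1, NULL, 0, NULL) == -1)
 *			err(1, "kevent: register");
 *		for (;;) {
 *			n = kevent(kq, NULL, 0, &ev, 1, NULL);
 *			if (n == -1)
 *				err(1, "kevent: wait");
 *			if (n == 1)
 *				printf("%lld bytes ready on fd %lu\n",
 *				    (long long)ev.data,
 *				    (unsigned long)ev.ident);
 *		}
 *	}
 */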
999
1000 /*
1001 * Register a given kevent kev onto the kqueue
1002 */
1003 static int
1004 kqueue_register(struct kqueue *kq, struct kevent *kev)
1005 {
1006 struct kfilter *kfilter;
1007 filedesc_t *fdp;
1008 file_t *fp;
1009 fdfile_t *ff;
1010 struct knote *kn, *newkn;
1011 struct klist *list;
1012 int error, fd, rv;
1013
1014 fdp = kq->kq_fdp;
1015 fp = NULL;
1016 kn = NULL;
1017 error = 0;
1018 fd = 0;
1019
1020 newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
1021
1022 rw_enter(&kqueue_filter_lock, RW_READER);
1023 kfilter = kfilter_byfilter(kev->filter);
1024 if (kfilter == NULL || kfilter->filtops == NULL) {
1025 /* filter not found nor implemented */
1026 rw_exit(&kqueue_filter_lock);
1027 kmem_free(newkn, sizeof(*newkn));
1028 return (EINVAL);
1029 }
1030
1031 /* search if knote already exists */
1032 if (kfilter->filtops->f_isfd) {
1033 /* monitoring a file descriptor */
1034 /* validate descriptor */
1035 if (kev->ident > INT_MAX
1036 || (fp = fd_getfile(fd = kev->ident)) == NULL) {
1037 rw_exit(&kqueue_filter_lock);
1038 kmem_free(newkn, sizeof(*newkn));
1039 return EBADF;
1040 }
1041 mutex_enter(&fdp->fd_lock);
1042 ff = fdp->fd_dt->dt_ff[fd];
1043 if (ff->ff_refcnt & FR_CLOSING) {
1044 error = EBADF;
1045 goto doneunlock;
1046 }
1047 if (fd <= fdp->fd_lastkqfile) {
1048 SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
1049 if (kq == kn->kn_kq &&
1050 kev->filter == kn->kn_filter)
1051 break;
1052 }
1053 }
1054 } else {
1055 /*
1056 * not monitoring a file descriptor, so
1057 * lookup knotes in internal hash table
1058 */
1059 mutex_enter(&fdp->fd_lock);
1060 if (fdp->fd_knhashmask != 0) {
1061 list = &fdp->fd_knhash[
1062 KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1063 SLIST_FOREACH(kn, list, kn_link) {
1064 if (kev->ident == kn->kn_id &&
1065 kq == kn->kn_kq &&
1066 kev->filter == kn->kn_filter)
1067 break;
1068 }
1069 }
1070 }
1071
1072 /*
1073 * kn now contains the matching knote, or NULL if no match
1074 */
1075 if (kev->flags & EV_ADD) {
1076 if (kn == NULL) {
1077 /* create new knote */
1078 kn = newkn;
1079 newkn = NULL;
1080 kn->kn_obj = fp;
1081 kn->kn_id = kev->ident;
1082 kn->kn_kq = kq;
1083 kn->kn_fop = kfilter->filtops;
1084 kn->kn_kfilter = kfilter;
1085 kn->kn_sfflags = kev->fflags;
1086 kn->kn_sdata = kev->data;
1087 kev->fflags = 0;
1088 kev->data = 0;
1089 kn->kn_kevent = *kev;
1090
1091 KASSERT(kn->kn_fop != NULL);
1092 /*
1093 * apply reference count to knote structure, and
1094 * do not release it at the end of this routine.
1095 */
1096 fp = NULL;
1097
1098 if (!kn->kn_fop->f_isfd) {
1099 /*
1100 * If knote is not on an fd, store on
1101 * internal hash table.
1102 */
1103 if (fdp->fd_knhashmask == 0) {
1104 /* XXXAD can block with fd_lock held */
1105 fdp->fd_knhash = hashinit(KN_HASHSIZE,
1106 HASH_LIST, true,
1107 &fdp->fd_knhashmask);
1108 }
1109 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
1110 fdp->fd_knhashmask)];
1111 } else {
1112 /* Otherwise, knote is on an fd. */
1113 list = (struct klist *)
1114 &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1115 if ((int)kn->kn_id > fdp->fd_lastkqfile)
1116 fdp->fd_lastkqfile = kn->kn_id;
1117 }
1118 SLIST_INSERT_HEAD(list, kn, kn_link);
1119
1120 KERNEL_LOCK(1, NULL); /* XXXSMP */
1121 error = (*kfilter->filtops->f_attach)(kn);
1122 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1123 if (error != 0) {
1124 #ifdef DEBUG
1125 const file_t *ft = kn->kn_obj;
1126 uprintf("%s: event type %d not supported for "
1127 "file type %d/%s (error %d)\n", __func__,
1128 kn->kn_filter, ft ? ft->f_type : -1,
1129 ft ? ft->f_ops->fo_name : "?", error);
1130 #endif
1131
1132 /* knote_detach() drops fdp->fd_lock */
1133 knote_detach(kn, fdp, false);
1134 goto done;
1135 }
1136 atomic_inc_uint(&kfilter->refcnt);
1137 } else {
1138 /*
1139 * The user may change some filter values after the
1140 * initial EV_ADD, but doing so will not reset any
1141 * filter which have already been triggered.
1142 */
1143 kn->kn_sfflags = kev->fflags;
1144 kn->kn_sdata = kev->data;
1145 kn->kn_kevent.udata = kev->udata;
1146 }
1147 /*
1148 * We can get here if we are trying to attach
1149 * an event to a file descriptor that does not
1150 * support events, and the attach routine is
1151 * broken and does not return an error.
1152 */
1153 KASSERT(kn->kn_fop != NULL);
1154 KASSERT(kn->kn_fop->f_event != NULL);
1155 KERNEL_LOCK(1, NULL); /* XXXSMP */
1156 rv = (*kn->kn_fop->f_event)(kn, 0);
1157 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1158 if (rv)
1159 knote_activate(kn);
1160 } else {
1161 if (kn == NULL) {
1162 error = ENOENT;
1163 goto doneunlock;
1164 }
1165 if (kev->flags & EV_DELETE) {
1166 /* knote_detach() drops fdp->fd_lock */
1167 knote_detach(kn, fdp, true);
1168 goto done;
1169 }
1170 }
1171
1172 /* disable knote */
1173 if ((kev->flags & EV_DISABLE)) {
1174 mutex_spin_enter(&kq->kq_lock);
1175 if ((kn->kn_status & KN_DISABLED) == 0)
1176 kn->kn_status |= KN_DISABLED;
1177 mutex_spin_exit(&kq->kq_lock);
1178 }
1179
1180 /* enable knote */
1181 if ((kev->flags & EV_ENABLE)) {
1182 knote_enqueue(kn);
1183 }
1184 doneunlock:
1185 mutex_exit(&fdp->fd_lock);
1186 done:
1187 rw_exit(&kqueue_filter_lock);
1188 if (newkn != NULL)
1189 kmem_free(newkn, sizeof(*newkn));
1190 if (fp != NULL)
1191 fd_putfile(fd);
1192 return (error);
1193 }
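
/*
 * Example (illustrative): the flag combinations handled above, modifying an
 * existing registration from userland.  Assumes fd was previously added to
 * kq with EVFILT_READ.
 *
 *	struct kevent chg;
 *
 *	Temporarily mute the event:
 *		EV_SET(&chg, fd, EVFILT_READ, EV_DISABLE, 0, 0, 0);
 *		kevent(kq, &chg, 1, NULL, 0, NULL);
 *
 *	Re-arm it (a still-pending event is queued again):
 *		EV_SET(&chg, fd, EVFILT_READ, EV_ENABLE, 0, 0, 0);
 *		kevent(kq, &chg, 1, NULL, 0, NULL);
 *
 *	Remove the registration entirely:
 *		EV_SET(&chg, fd, EVFILT_READ, EV_DELETE, 0, 0, 0);
 *		kevent(kq, &chg, 1, NULL, 0, NULL);
 */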
1194
1195 #if defined(DEBUG)
1196 #define KN_FMT(buf, kn) \
1197 (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
1198
1199 static void
1200 kqueue_check(const char *func, size_t line, const struct kqueue *kq)
1201 {
1202 const struct knote *kn;
1203 int count;
1204 int nmarker;
1205 char buf[128];
1206
1207 KASSERT(mutex_owned(&kq->kq_lock));
1208 KASSERT(kq->kq_count >= 0);
1209
1210 count = 0;
1211 nmarker = 0;
1212 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1213 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
1214 panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
1215 func, line, kq, kn, KN_FMT(buf, kn));
1216 }
1217 if ((kn->kn_status & KN_MARKER) == 0) {
1218 if (kn->kn_kq != kq) {
1219 panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
1220 func, line, kq, kn, kn->kn_kq,
1221 KN_FMT(buf, kn));
1222 }
1223 if ((kn->kn_status & KN_ACTIVE) == 0) {
1224 panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
1225 func, line, kq, kn, KN_FMT(buf, kn));
1226 }
1227 count++;
1228 if (count > kq->kq_count) {
1229 goto bad;
1230 }
1231 } else {
1232 nmarker++;
1233 #if 0
1234 if (nmarker > 10000) {
1235 panic("%s,%zu: kq=%p too many markers: "
1236 "%d != %d, nmarker=%d",
1237 func, line, kq, kq->kq_count, count,
1238 nmarker);
1239 }
1240 #endif
1241 }
1242 }
1243 if (kq->kq_count != count) {
1244 bad:
1245 panic("%s,%zu: kq=%p kq->kq_count(%d) != count(%d), nmarker=%d",
1246 func, line, kq, kq->kq_count, count, nmarker);
1247 }
1248 }
1249 #define kq_check(a) kqueue_check(__func__, __LINE__, (a))
1250 #else /* defined(DEBUG) */
1251 #define kq_check(a) /* nothing */
1252 #endif /* defined(DEBUG) */
1253
1254 /*
1255 * Scan through the list of events on fp (for a maximum of maxevents),
1256  * returning the results in ulistp.  The timeout is determined by tsp: if
1257  * NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
1258  * as appropriate.
1259 */
1260 static int
1261 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
1262 const struct timespec *tsp, register_t *retval,
1263 const struct kevent_ops *keops, struct kevent *kevbuf,
1264 size_t kevcnt)
1265 {
1266 struct kqueue *kq;
1267 struct kevent *kevp;
1268 struct timespec ats, sleepts;
1269 struct knote *kn, *marker, morker;
1270 size_t count, nkev, nevents;
1271 int timeout, error, rv;
1272 filedesc_t *fdp;
1273
1274 fdp = curlwp->l_fd;
1275 kq = fp->f_kqueue;
1276 count = maxevents;
1277 nkev = nevents = error = 0;
1278 if (count == 0) {
1279 *retval = 0;
1280 return 0;
1281 }
1282
1283 if (tsp) { /* timeout supplied */
1284 ats = *tsp;
1285 if (inittimeleft(&ats, &sleepts) == -1) {
1286 *retval = maxevents;
1287 return EINVAL;
1288 }
1289 timeout = tstohz(&ats);
1290 if (timeout <= 0)
1291 timeout = -1; /* do poll */
1292 } else {
1293 /* no timeout, wait forever */
1294 timeout = 0;
1295 }
1296
1297 memset(&morker, 0, sizeof(morker));
1298 marker = &morker;
1299 marker->kn_status = KN_MARKER;
1300 mutex_spin_enter(&kq->kq_lock);
1301 retry:
1302 kevp = kevbuf;
1303 if (kq->kq_count == 0) {
1304 if (timeout >= 0) {
1305 error = cv_timedwait_sig(&kq->kq_cv,
1306 &kq->kq_lock, timeout);
1307 if (error == 0) {
1308 if (tsp == NULL || (timeout =
1309 gettimeleft(&ats, &sleepts)) > 0)
1310 goto retry;
1311 } else {
1312 /* don't restart after signals... */
1313 if (error == ERESTART)
1314 error = EINTR;
1315 if (error == EWOULDBLOCK)
1316 error = 0;
1317 }
1318 }
1319 mutex_spin_exit(&kq->kq_lock);
1320 } else {
1321 /* mark end of knote list */
1322 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1323
1324 /*
1325 * Acquire the fdp->fd_lock interlock to avoid races with
1326 * file creation/destruction from other threads.
1327 */
1328 mutex_spin_exit(&kq->kq_lock);
1329 mutex_enter(&fdp->fd_lock);
1330 mutex_spin_enter(&kq->kq_lock);
1331
1332 while (count != 0) {
1333 kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */
1334 while ((kn->kn_status & KN_MARKER) != 0) {
1335 if (kn == marker) {
1336 /* it's our marker, stop */
1337 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1338 if (count < maxevents || (tsp != NULL &&
1339 (timeout = gettimeleft(&ats,
1340 &sleepts)) <= 0))
1341 goto done;
1342 mutex_exit(&fdp->fd_lock);
1343 goto retry;
1344 }
1345 /* someone else's marker. */
1346 kn = TAILQ_NEXT(kn, kn_tqe);
1347 }
1348 kq_check(kq);
1349 kq->kq_count--;
1350 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1351 kn->kn_status &= ~KN_QUEUED;
1352 kn->kn_status |= KN_BUSY;
1353 kq_check(kq);
1354 if (kn->kn_status & KN_DISABLED) {
1355 kn->kn_status &= ~KN_BUSY;
1356 /* don't want disabled events */
1357 continue;
1358 }
1359 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1360 mutex_spin_exit(&kq->kq_lock);
1361 KASSERT(kn->kn_fop != NULL);
1362 KASSERT(kn->kn_fop->f_event != NULL);
1363 KERNEL_LOCK(1, NULL); /* XXXSMP */
1364 KASSERT(mutex_owned(&fdp->fd_lock));
1365 rv = (*kn->kn_fop->f_event)(kn, 0);
1366 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1367 mutex_spin_enter(&kq->kq_lock);
1368 /* Re-poll if note was re-enqueued. */
1369 if ((kn->kn_status & KN_QUEUED) != 0) {
1370 kn->kn_status &= ~KN_BUSY;
1371 continue;
1372 }
1373 if (rv == 0) {
1374 /*
1375 * non-ONESHOT event that hasn't
1376 * triggered again, so de-queue.
1377 */
1378 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1379 continue;
1380 }
1381 }
1382 /* XXXAD should be got from f_event if !oneshot. */
1383 *kevp++ = kn->kn_kevent;
1384 nkev++;
1385 if (kn->kn_flags & EV_ONESHOT) {
1386 /* delete ONESHOT events after retrieval */
1387 kn->kn_status &= ~KN_BUSY;
1388 mutex_spin_exit(&kq->kq_lock);
1389 knote_detach(kn, fdp, true);
1390 mutex_enter(&fdp->fd_lock);
1391 mutex_spin_enter(&kq->kq_lock);
1392 } else if (kn->kn_flags & EV_CLEAR) {
1393 /* clear state after retrieval */
1394 kn->kn_data = 0;
1395 kn->kn_fflags = 0;
1396 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
1397 } else if (kn->kn_flags & EV_DISPATCH) {
1398 kn->kn_status |= KN_DISABLED;
1399 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
1400 } else {
1401 /* add event back on list */
1402 kq_check(kq);
1403 kn->kn_status |= KN_QUEUED;
1404 kn->kn_status &= ~KN_BUSY;
1405 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1406 kq->kq_count++;
1407 kq_check(kq);
1408 }
1409 if (nkev == kevcnt) {
1410 /* do copyouts in kevcnt chunks */
1411 mutex_spin_exit(&kq->kq_lock);
1412 mutex_exit(&fdp->fd_lock);
1413 error = (*keops->keo_put_events)
1414 (keops->keo_private,
1415 kevbuf, ulistp, nevents, nkev);
1416 mutex_enter(&fdp->fd_lock);
1417 mutex_spin_enter(&kq->kq_lock);
1418 nevents += nkev;
1419 nkev = 0;
1420 kevp = kevbuf;
1421 }
1422 count--;
1423 if (error != 0 || count == 0) {
1424 /* remove marker */
1425 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1426 break;
1427 }
1428 }
1429 done:
1430 mutex_spin_exit(&kq->kq_lock);
1431 mutex_exit(&fdp->fd_lock);
1432 }
1433 if (nkev != 0) {
1434 /* copyout remaining events */
1435 error = (*keops->keo_put_events)(keops->keo_private,
1436 kevbuf, ulistp, nevents, nkev);
1437 }
1438 *retval = maxevents - count;
1439
1440 return error;
1441 }
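
/*
 * Example (illustrative): the three timeout behaviours described above, as
 * seen from a userland kevent(2) caller with an open kqueue descriptor kq.
 *
 *	struct timespec zero = { 0, 0 };
 *	struct timespec half = { 0, 500000000 };	-- 500 ms
 *	struct kevent ev;
 *
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	-- block until an event arrives
 *	kevent(kq, NULL, 0, &ev, 1, &zero);	-- poll and return immediately
 *	kevent(kq, NULL, 0, &ev, 1, &half);	-- wait at most 500 ms
 */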
1442
1443 /*
1444 * fileops ioctl method for a kqueue descriptor.
1445 *
1446 * Two ioctls are currently supported. They both use struct kfilter_mapping:
1447  *	KFILTER_BYFILTER	find name for filter, and return result in
1448  *				name, which is of size len.
1449  *	KFILTER_BYNAME		find filter for name.  len is ignored.
1450 */
1451 /*ARGSUSED*/
1452 static int
1453 kqueue_ioctl(file_t *fp, u_long com, void *data)
1454 {
1455 struct kfilter_mapping *km;
1456 const struct kfilter *kfilter;
1457 char *name;
1458 int error;
1459
1460 km = data;
1461 error = 0;
1462 name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1463
1464 switch (com) {
1465 case KFILTER_BYFILTER: /* convert filter -> name */
1466 rw_enter(&kqueue_filter_lock, RW_READER);
1467 kfilter = kfilter_byfilter(km->filter);
1468 if (kfilter != NULL) {
1469 strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1470 rw_exit(&kqueue_filter_lock);
1471 error = copyoutstr(name, km->name, km->len, NULL);
1472 } else {
1473 rw_exit(&kqueue_filter_lock);
1474 error = ENOENT;
1475 }
1476 break;
1477
1478 case KFILTER_BYNAME: /* convert name -> filter */
1479 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1480 if (error) {
1481 break;
1482 }
1483 rw_enter(&kqueue_filter_lock, RW_READER);
1484 kfilter = kfilter_byname(name);
1485 if (kfilter != NULL)
1486 km->filter = kfilter->filter;
1487 else
1488 error = ENOENT;
1489 rw_exit(&kqueue_filter_lock);
1490 break;
1491
1492 default:
1493 error = ENOTTY;
1494 break;
1495
1496 }
1497 kmem_free(name, KFILTER_MAXNAME);
1498 return (error);
1499 }
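
/*
 * Example (illustrative userland sketch): translating between filter names
 * and ids with the two ioctls above.  Assumes struct kfilter_mapping and
 * the KFILTER_BYNAME/KFILTER_BYFILTER ioctls are visible via <sys/event.h>,
 * and that kq is an open kqueue descriptor.
 *
 *	char buf[256];			-- KFILTER_MAXNAME-sized
 *	struct kfilter_mapping km;
 *
 *	Name -> filter id (len is ignored):
 *		km.name = __UNCONST("EVFILT_TIMER");
 *		if (ioctl(kq, KFILTER_BYNAME, &km) == 0)
 *			printf("EVFILT_TIMER = %u\n", km.filter);
 *
 *	Filter id -> name (written into name, at most len bytes):
 *		km.name = buf;
 *		km.len = sizeof(buf);
 *		km.filter = EVFILT_READ;
 *		if (ioctl(kq, KFILTER_BYFILTER, &km) == 0)
 *			printf("filter %u is %s\n", km.filter, buf);
 */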
1500
1501 /*
1502 * fileops fcntl method for a kqueue descriptor.
1503 */
1504 static int
1505 kqueue_fcntl(file_t *fp, u_int com, void *data)
1506 {
1507
1508 return (ENOTTY);
1509 }
1510
1511 /*
1512 * fileops poll method for a kqueue descriptor.
1513 * Determine if kqueue has events pending.
1514 */
1515 static int
1516 kqueue_poll(file_t *fp, int events)
1517 {
1518 struct kqueue *kq;
1519 int revents;
1520
1521 kq = fp->f_kqueue;
1522
1523 revents = 0;
1524 if (events & (POLLIN | POLLRDNORM)) {
1525 mutex_spin_enter(&kq->kq_lock);
1526 if (kq->kq_count != 0) {
1527 revents |= events & (POLLIN | POLLRDNORM);
1528 } else {
1529 selrecord(curlwp, &kq->kq_sel);
1530 }
1531 kq_check(kq);
1532 mutex_spin_exit(&kq->kq_lock);
1533 }
1534
1535 return revents;
1536 }
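
/*
 * Example (illustrative): because of the method above, a kqueue descriptor
 * can itself be nested inside a poll(2)/select(2) loop; POLLIN is reported
 * while events are pending.  Assumes kq is an open kqueue descriptor.
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = kq, .events = POLLIN };
 *	struct timespec zero = { 0, 0 };
 *	struct kevent ev;
 *
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
 *		(void)kevent(kq, NULL, 0, &ev, 1, &zero);
 *	}
 */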
1537
1538 /*
1539 * fileops stat method for a kqueue descriptor.
1540 * Returns dummy info, with st_size being number of events pending.
1541 */
1542 static int
1543 kqueue_stat(file_t *fp, struct stat *st)
1544 {
1545 struct kqueue *kq;
1546
1547 kq = fp->f_kqueue;
1548
1549 memset(st, 0, sizeof(*st));
1550 st->st_size = kq->kq_count;
1551 st->st_blksize = sizeof(struct kevent);
1552 st->st_mode = S_IFIFO;
1553
1554 return 0;
1555 }
1556
1557 static void
1558 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
1559 {
1560 struct knote *kn;
1561 filedesc_t *fdp;
1562
1563 fdp = kq->kq_fdp;
1564
1565 KASSERT(mutex_owned(&fdp->fd_lock));
1566
1567 for (kn = SLIST_FIRST(list); kn != NULL;) {
1568 if (kq != kn->kn_kq) {
1569 kn = SLIST_NEXT(kn, kn_link);
1570 continue;
1571 }
1572 knote_detach(kn, fdp, true);
1573 mutex_enter(&fdp->fd_lock);
1574 kn = SLIST_FIRST(list);
1575 }
1576 }
1577
1578
1579 /*
1580 * fileops close method for a kqueue descriptor.
1581 */
1582 static int
1583 kqueue_close(file_t *fp)
1584 {
1585 struct kqueue *kq;
1586 filedesc_t *fdp;
1587 fdfile_t *ff;
1588 int i;
1589
1590 kq = fp->f_kqueue;
1591 fp->f_kqueue = NULL;
1592 fp->f_type = 0;
1593 fdp = curlwp->l_fd;
1594
1595 mutex_enter(&fdp->fd_lock);
1596 for (i = 0; i <= fdp->fd_lastkqfile; i++) {
1597 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1598 continue;
1599 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1600 }
1601 if (fdp->fd_knhashmask != 0) {
1602 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
1603 kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
1604 }
1605 }
1606 mutex_exit(&fdp->fd_lock);
1607
1608 KASSERT(kq->kq_count == 0);
1609 mutex_destroy(&kq->kq_lock);
1610 cv_destroy(&kq->kq_cv);
1611 seldestroy(&kq->kq_sel);
1612 kmem_free(kq, sizeof(*kq));
1613
1614 return (0);
1615 }
1616
1617 /*
1618 * struct fileops kqfilter method for a kqueue descriptor.
1619 * Event triggered when monitored kqueue changes.
1620 */
1621 static int
1622 kqueue_kqfilter(file_t *fp, struct knote *kn)
1623 {
1624 struct kqueue *kq;
1625
1626 kq = ((file_t *)kn->kn_obj)->f_kqueue;
1627
1628 KASSERT(fp == kn->kn_obj);
1629
1630 if (kn->kn_filter != EVFILT_READ)
1631 return 1;
1632
1633 kn->kn_fop = &kqread_filtops;
1634 mutex_enter(&kq->kq_lock);
1635 SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
1636 mutex_exit(&kq->kq_lock);
1637
1638 return 0;
1639 }
1640
1641
1642 /*
1643 * Walk down a list of knotes, activating them if their event has
1644 * triggered. The caller's object lock (e.g. device driver lock)
1645 * must be held.
1646 */
1647 void
1648 knote(struct klist *list, long hint)
1649 {
1650 struct knote *kn, *tmpkn;
1651
1652 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1653 KASSERT(kn->kn_fop != NULL);
1654 KASSERT(kn->kn_fop->f_event != NULL);
1655 if ((*kn->kn_fop->f_event)(kn, hint))
1656 knote_activate(kn);
1657 }
1658 }
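
/*
 * Example (illustrative sketch): the pattern a driver follows to feed this
 * function.  Knotes are inserted on the driver's klist from its kqfilter
 * method, and state changes are announced with the object lock held, using
 * NOTE_SUBMIT as the hint so the event method knows the lock is taken.
 * The mydev_* names are placeholders.
 *
 *	In the softc:
 *		kmutex_t	sc_lock;	-- the "object lock"
 *		struct selinfo	sc_rsel;	-- contains the klist
 *		size_t		sc_bytes;	-- data available to read
 *
 *	Event method, honouring the NOTE_SUBMIT convention:
 *		static int
 *		filt_mydevread(struct knote *kn, long hint)
 *		{
 *			struct mydev_softc *sc = kn->kn_hook;
 *			int rv;
 *
 *			if (hint != NOTE_SUBMIT)
 *				mutex_enter(&sc->sc_lock);
 *			kn->kn_data = sc->sc_bytes;
 *			rv = (kn->kn_data > 0);
 *			if (hint != NOTE_SUBMIT)
 *				mutex_exit(&sc->sc_lock);
 *			return rv;
 *		}
 *
 *	Notification from the interrupt/completion path:
 *		mutex_enter(&sc->sc_lock);
 *		sc->sc_bytes += n;
 *		selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
 *		mutex_exit(&sc->sc_lock);
 */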
1659
1660 /*
1661 * Remove all knotes referencing a specified fd
1662 */
1663 void
1664 knote_fdclose(int fd)
1665 {
1666 struct klist *list;
1667 struct knote *kn;
1668 filedesc_t *fdp;
1669
1670 fdp = curlwp->l_fd;
1671 list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1672 mutex_enter(&fdp->fd_lock);
1673 while ((kn = SLIST_FIRST(list)) != NULL) {
1674 knote_detach(kn, fdp, true);
1675 mutex_enter(&fdp->fd_lock);
1676 }
1677 mutex_exit(&fdp->fd_lock);
1678 }
1679
1680 /*
1681  * Drop knote.  Called with fdp->fd_lock held, which is dropped before
1682 * returning.
1683 */
1684 static void
1685 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
1686 {
1687 struct klist *list;
1688 struct kqueue *kq;
1689
1690 kq = kn->kn_kq;
1691
1692 KASSERT((kn->kn_status & KN_MARKER) == 0);
1693 KASSERT(mutex_owned(&fdp->fd_lock));
1694
1695 KASSERT(kn->kn_fop != NULL);
1696 /* Remove from monitored object. */
1697 if (dofop) {
1698 KASSERT(kn->kn_fop->f_detach != NULL);
1699 KERNEL_LOCK(1, NULL); /* XXXSMP */
1700 (*kn->kn_fop->f_detach)(kn);
1701 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1702 }
1703
1704 /* Remove from descriptor table. */
1705 if (kn->kn_fop->f_isfd)
1706 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1707 else
1708 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1709
1710 SLIST_REMOVE(list, kn, knote, kn_link);
1711
1712 /* Remove from kqueue. */
1713 again:
1714 mutex_spin_enter(&kq->kq_lock);
1715 if ((kn->kn_status & KN_QUEUED) != 0) {
1716 kq_check(kq);
1717 kq->kq_count--;
1718 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1719 kn->kn_status &= ~KN_QUEUED;
1720 kq_check(kq);
1721 } else if (kn->kn_status & KN_BUSY) {
1722 mutex_spin_exit(&kq->kq_lock);
1723 goto again;
1724 }
1725 mutex_spin_exit(&kq->kq_lock);
1726
1727 mutex_exit(&fdp->fd_lock);
1728 if (kn->kn_fop->f_isfd)
1729 fd_putfile(kn->kn_id);
1730 atomic_dec_uint(&kn->kn_kfilter->refcnt);
1731 kmem_free(kn, sizeof(*kn));
1732 }
1733
1734 /*
1735  * Enable the knote and, if it has a pending (active) event, queue it.
1736 */
1737 static void
1738 knote_enqueue(struct knote *kn)
1739 {
1740 struct kqueue *kq;
1741
1742 KASSERT((kn->kn_status & KN_MARKER) == 0);
1743
1744 kq = kn->kn_kq;
1745
1746 mutex_spin_enter(&kq->kq_lock);
1747 if ((kn->kn_status & KN_DISABLED) != 0) {
1748 kn->kn_status &= ~KN_DISABLED;
1749 }
1750 if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
1751 kq_check(kq);
1752 kn->kn_status |= KN_QUEUED;
1753 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1754 kq->kq_count++;
1755 kq_check(kq);
1756 cv_broadcast(&kq->kq_cv);
1757 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1758 }
1759 mutex_spin_exit(&kq->kq_lock);
1760 }
1761 /*
1762  * Mark the knote active and queue it, unless it is disabled or already queued.
1763 */
1764 static void
1765 knote_activate(struct knote *kn)
1766 {
1767 struct kqueue *kq;
1768
1769 KASSERT((kn->kn_status & KN_MARKER) == 0);
1770
1771 kq = kn->kn_kq;
1772
1773 mutex_spin_enter(&kq->kq_lock);
1774 kn->kn_status |= KN_ACTIVE;
1775 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
1776 kq_check(kq);
1777 kn->kn_status |= KN_QUEUED;
1778 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1779 kq->kq_count++;
1780 kq_check(kq);
1781 cv_broadcast(&kq->kq_cv);
1782 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1783 }
1784 mutex_spin_exit(&kq->kq_lock);
1785 }
1786