1 /* $NetBSD: kern_event.c,v 1.76.2.4 2017/12/03 11:38:44 jdolecek Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon (at) FreeBSD.org>
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
58 */
59
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.76.2.4 2017/12/03 11:38:44 jdolecek Exp $");
62
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/wait.h>
67 #include <sys/proc.h>
68 #include <sys/file.h>
69 #include <sys/select.h>
70 #include <sys/queue.h>
71 #include <sys/event.h>
72 #include <sys/eventvar.h>
73 #include <sys/poll.h>
74 #include <sys/kmem.h>
75 #include <sys/stat.h>
76 #include <sys/filedesc.h>
77 #include <sys/syscallargs.h>
78 #include <sys/kauth.h>
79 #include <sys/conf.h>
80 #include <sys/atomic.h>
81
82 static int kqueue_scan(file_t *, size_t, struct kevent *,
83 const struct timespec *, register_t *,
84 const struct kevent_ops *, struct kevent *,
85 size_t);
86 static int kqueue_ioctl(file_t *, u_long, void *);
87 static int kqueue_fcntl(file_t *, u_int, void *);
88 static int kqueue_poll(file_t *, int);
89 static int kqueue_kqfilter(file_t *, struct knote *);
90 static int kqueue_stat(file_t *, struct stat *);
91 static int kqueue_close(file_t *);
92 static int kqueue_register(struct kqueue *, struct kevent *);
93 static void kqueue_doclose(struct kqueue *, struct klist *, int);
94
95 static void knote_detach(struct knote *, filedesc_t *fdp, bool);
96 static void knote_enqueue(struct knote *);
97 static void knote_activate(struct knote *);
98
99 static void filt_kqdetach(struct knote *);
100 static int filt_kqueue(struct knote *, long hint);
101 static int filt_procattach(struct knote *);
102 static void filt_procdetach(struct knote *);
103 static int filt_proc(struct knote *, long hint);
104 static int filt_fileattach(struct knote *);
105 static void filt_timerexpire(void *x);
106 static int filt_timerattach(struct knote *);
107 static void filt_timerdetach(struct knote *);
108 static int filt_timer(struct knote *, long hint);
109
110 static const struct fileops kqueueops = {
111 .fo_name = "kqueue",
112 .fo_read = (void *)enxio,
113 .fo_write = (void *)enxio,
114 .fo_ioctl = kqueue_ioctl,
115 .fo_fcntl = kqueue_fcntl,
116 .fo_poll = kqueue_poll,
117 .fo_stat = kqueue_stat,
118 .fo_close = kqueue_close,
119 .fo_kqfilter = kqueue_kqfilter,
120 .fo_restart = fnullop_restart,
121 };
122
123 static const struct filterops kqread_filtops = {
124 .f_isfd = 1,
125 .f_attach = NULL,
126 .f_detach = filt_kqdetach,
127 .f_event = filt_kqueue,
128 };
129
130 static const struct filterops proc_filtops = {
131 .f_isfd = 0,
132 .f_attach = filt_procattach,
133 .f_detach = filt_procdetach,
134 .f_event = filt_proc,
135 };
136
137 static const struct filterops file_filtops = {
138 .f_isfd = 1,
139 .f_attach = filt_fileattach,
140 .f_detach = NULL,
141 .f_event = NULL,
142 };
143
144 static const struct filterops timer_filtops = {
145 .f_isfd = 0,
146 .f_attach = filt_timerattach,
147 .f_detach = filt_timerdetach,
148 .f_event = filt_timer,
149 };
150
151 static u_int kq_ncallouts = 0;
152 static int kq_calloutmax = (4 * 1024);
153
154 #define KN_HASHSIZE 64 /* XXX should be tunable */
155 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
156
157 extern const struct filterops sig_filtops;
158
159 /*
160 * Table for all system-defined filters.
161 * These should be listed in the numeric order of the EVFILT_* defines.
162 * If filtops is NULL, the filter isn't implemented in NetBSD.
163 * End of list is when name is NULL.
164 *
165 * Note that 'refcnt' is meaningless for built-in filters.
166 */
167 struct kfilter {
168 const char *name; /* name of filter */
169 uint32_t filter; /* id of filter */
170 unsigned refcnt; /* reference count */
171 const struct filterops *filtops;/* operations for filter */
172 size_t namelen; /* length of name string */
173 };
174
175 /* System defined filters */
176 static struct kfilter sys_kfilters[] = {
177 { "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
178 { "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
179 { "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
180 { "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
181 { "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
182 { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
183 { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
184 { NULL, 0, 0, NULL, 0 },
185 };
186
187 /* User defined kfilters */
188 static struct kfilter *user_kfilters; /* array */
189 static int user_kfilterc; /* current offset */
190 static int user_kfiltermaxc; /* max size so far */
191 static size_t user_kfiltersz; /* size of allocated memory */
192
193 /*
194 * Global Locks.
195 *
196 * Lock order:
197 *
198 * kqueue_filter_lock
199 * -> kn_kq->kq_fdp->fd_lock
200 * -> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
201 * -> kn_kq->kq_lock
202 *
203 * Locking rules:
204 *
205 * f_attach: fdp->fd_lock, KERNEL_LOCK
206 * f_detach: fdp->fd_lock, KERNEL_LOCK
207 * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
208 * f_event via knote: whatever caller guarantees
209 * Typically, f_event(NOTE_SUBMIT) via knote: object lock
210 * f_event(!NOTE_SUBMIT) via knote: nothing,
211 * acquires/releases object lock inside.
212 */
213 static krwlock_t kqueue_filter_lock; /* lock on filter lists */
214 static kmutex_t kqueue_misc_lock; /* miscellaneous */
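/*
 * Illustrative sketch of the lock order documented above, as followed by
 * kqueue_register() and kqueue_scan() below.  "obj_lock", "fdp" and "kq"
 * are placeholders for an object lock, a filedesc_t and a kqueue; the
 * snippet is compiled out and only shows the nesting.
 */
#if 0
	rw_enter(&kqueue_filter_lock, RW_READER);	/* outermost */
	mutex_enter(&fdp->fd_lock);
	mutex_enter(obj_lock);			/* e.g. kqueue_misc_lock */
	mutex_spin_enter(&kq->kq_lock);		/* innermost */

	/* ... manipulate knotes and the kqueue here ... */

	mutex_spin_exit(&kq->kq_lock);
	mutex_exit(obj_lock);
	mutex_exit(&fdp->fd_lock);
	rw_exit(&kqueue_filter_lock);
#endif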
215
216 static kauth_listener_t kqueue_listener;
217
218 static int
219 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
220 void *arg0, void *arg1, void *arg2, void *arg3)
221 {
222 struct proc *p;
223 int result;
224
225 result = KAUTH_RESULT_DEFER;
226 p = arg0;
227
228 if (action != KAUTH_PROCESS_KEVENT_FILTER)
229 return result;
230
231 if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
232 ISSET(p->p_flag, PK_SUGID)))
233 return result;
234
235 result = KAUTH_RESULT_ALLOW;
236
237 return result;
238 }
239
240 /*
241 * Initialize the kqueue subsystem.
242 */
243 void
244 kqueue_init(void)
245 {
246
247 rw_init(&kqueue_filter_lock);
248 mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
249
250 kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
251 kqueue_listener_cb, NULL);
252 }
253
254 /*
255 * Find kfilter entry by name, or NULL if not found.
256 */
257 static struct kfilter *
258 kfilter_byname_sys(const char *name)
259 {
260 int i;
261
262 KASSERT(rw_lock_held(&kqueue_filter_lock));
263
264 for (i = 0; sys_kfilters[i].name != NULL; i++) {
265 if (strcmp(name, sys_kfilters[i].name) == 0)
266 return &sys_kfilters[i];
267 }
268 return NULL;
269 }
270
271 static struct kfilter *
272 kfilter_byname_user(const char *name)
273 {
274 int i;
275
276 KASSERT(rw_lock_held(&kqueue_filter_lock));
277
278 /* user filter slots have a NULL name if previously deregistered */
279 for (i = 0; i < user_kfilterc ; i++) {
280 if (user_kfilters[i].name != NULL &&
281 strcmp(name, user_kfilters[i].name) == 0)
282 return &user_kfilters[i];
283 }
284 return NULL;
285 }
286
287 static struct kfilter *
288 kfilter_byname(const char *name)
289 {
290 struct kfilter *kfilter;
291
292 KASSERT(rw_lock_held(&kqueue_filter_lock));
293
294 if ((kfilter = kfilter_byname_sys(name)) != NULL)
295 return kfilter;
296
297 return kfilter_byname_user(name);
298 }
299
300 /*
301 * Find kfilter entry by filter id, or NULL if not found.
302 * Assumes entries are indexed in filter id order, for speed.
303 */
304 static struct kfilter *
305 kfilter_byfilter(uint32_t filter)
306 {
307 struct kfilter *kfilter;
308
309 KASSERT(rw_lock_held(&kqueue_filter_lock));
310
311 if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
312 kfilter = &sys_kfilters[filter];
313 else if (user_kfilters != NULL &&
314 filter < EVFILT_SYSCOUNT + user_kfilterc)
315 /* it's a user filter */
316 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
317 else
318 return (NULL); /* out of range */
319 KASSERT(kfilter->filter == filter); /* sanity check! */
320 return (kfilter);
321 }
322
323 /*
324 * Register a new kfilter. Stores the entry in user_kfilters.
325 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
326 * If retfilter != NULL, the new filterid is returned in it.
327 */
328 int
329 kfilter_register(const char *name, const struct filterops *filtops,
330 int *retfilter)
331 {
332 struct kfilter *kfilter;
333 size_t len;
334 int i;
335
336 if (name == NULL || name[0] == '\0' || filtops == NULL)
337 return (EINVAL); /* invalid args */
338
339 rw_enter(&kqueue_filter_lock, RW_WRITER);
340 if (kfilter_byname(name) != NULL) {
341 rw_exit(&kqueue_filter_lock);
342 return (EEXIST); /* already exists */
343 }
344 if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
345 rw_exit(&kqueue_filter_lock);
346 return (EINVAL); /* too many */
347 }
348
349 for (i = 0; i < user_kfilterc; i++) {
350 kfilter = &user_kfilters[i];
351 if (kfilter->name == NULL) {
352 /* Previously deregistered slot. Reuse. */
353 goto reuse;
354 }
355 }
356
357 /* check if need to grow user_kfilters */
358 if (user_kfilterc + 1 > user_kfiltermaxc) {
359 /* Grow in KFILTER_EXTENT chunks. */
360 user_kfiltermaxc += KFILTER_EXTENT;
361 len = user_kfiltermaxc * sizeof(*kfilter);
362 kfilter = kmem_alloc(len, KM_SLEEP);
363 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
364 if (user_kfilters != NULL) {
365 memcpy(kfilter, user_kfilters, user_kfiltersz);
366 kmem_free(user_kfilters, user_kfiltersz);
367 }
368 user_kfiltersz = len;
369 user_kfilters = kfilter;
370 }
371 /* Adding new slot */
372 kfilter = &user_kfilters[user_kfilterc++];
373 reuse:
374 kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
375
376 kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
377
378 kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
379 memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
380
381 if (retfilter != NULL)
382 *retfilter = kfilter->filter;
383 rw_exit(&kqueue_filter_lock);
384
385 return (0);
386 }
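/*
 * Hypothetical usage sketch: a kernel component registering its own filter
 * with kfilter_register() and removing it again with kfilter_unregister().
 * All of the example_* names are invented for illustration only.
 */
#if 0
static int
example_filt_attach(struct knote *kn)
{
	/* Hook kn onto the component's own klist here. */
	return 0;
}

static void
example_filt_detach(struct knote *kn)
{
	/* Unhook kn from the component's klist here. */
}

static int
example_filt_event(struct knote *kn, long hint)
{
	kn->kn_data = 1;	/* how much is "ready" */
	return 1;		/* non-zero activates the knote */
}

static const struct filterops example_filtops = {
	.f_isfd = 0,
	.f_attach = example_filt_attach,
	.f_detach = example_filt_detach,
	.f_event = example_filt_event,
};

static int example_filter_id;

static void
example_filter_init(void)
{
	int error;

	error = kfilter_register("EVFILT_EXAMPLE", &example_filtops,
	    &example_filter_id);
	if (error != 0)
		printf("EVFILT_EXAMPLE: kfilter_register failed (%d)\n",
		    error);
}

static void
example_filter_fini(void)
{
	(void)kfilter_unregister("EVFILT_EXAMPLE");
}
#endif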
387
388 /*
389 * Unregister a kfilter previously registered with kfilter_register.
390 * This retains the filter id, but clears the name and frees filtops (filter
391 * operations), so that the number isn't reused during a boot.
392 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
393 */
394 int
395 kfilter_unregister(const char *name)
396 {
397 struct kfilter *kfilter;
398
399 if (name == NULL || name[0] == '\0')
400 return (EINVAL); /* invalid name */
401
402 rw_enter(&kqueue_filter_lock, RW_WRITER);
403 if (kfilter_byname_sys(name) != NULL) {
404 rw_exit(&kqueue_filter_lock);
405 return (EINVAL); /* can't detach system filters */
406 }
407
408 kfilter = kfilter_byname_user(name);
409 if (kfilter == NULL) {
410 rw_exit(&kqueue_filter_lock);
411 return (ENOENT);
412 }
413 if (kfilter->refcnt != 0) {
414 rw_exit(&kqueue_filter_lock);
415 return (EBUSY);
416 }
417
418 /* Cast away const (but we know it's safe). */
419 kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
420 kfilter->name = NULL; /* mark as `not implemented' */
421
422 if (kfilter->filtops != NULL) {
423 /* Cast away const (but we know it's safe). */
424 kmem_free(__UNCONST(kfilter->filtops),
425 sizeof(*kfilter->filtops));
426 kfilter->filtops = NULL; /* mark as `not implemented' */
427 }
428 rw_exit(&kqueue_filter_lock);
429
430 return (0);
431 }
432
433
434 /*
435 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
436 * descriptors. Calls fileops kqfilter method for given file descriptor.
437 */
438 static int
439 filt_fileattach(struct knote *kn)
440 {
441 file_t *fp;
442
443 fp = kn->kn_obj;
444
445 return (*fp->f_ops->fo_kqfilter)(fp, kn);
446 }
447
448 /*
449 * Filter detach method for EVFILT_READ on kqueue descriptor.
450 */
451 static void
452 filt_kqdetach(struct knote *kn)
453 {
454 struct kqueue *kq;
455
456 kq = ((file_t *)kn->kn_obj)->f_kqueue;
457
458 mutex_spin_enter(&kq->kq_lock);
459 SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
460 mutex_spin_exit(&kq->kq_lock);
461 }
462
463 /*
464 * Filter event method for EVFILT_READ on kqueue descriptor.
465 */
466 /*ARGSUSED*/
467 static int
468 filt_kqueue(struct knote *kn, long hint)
469 {
470 struct kqueue *kq;
471 int rv;
472
473 kq = ((file_t *)kn->kn_obj)->f_kqueue;
474
475 if (hint != NOTE_SUBMIT)
476 mutex_spin_enter(&kq->kq_lock);
477 kn->kn_data = kq->kq_count;
478 rv = (kn->kn_data > 0);
479 if (hint != NOTE_SUBMIT)
480 mutex_spin_exit(&kq->kq_lock);
481
482 return rv;
483 }
484
485 /*
486 * Filter attach method for EVFILT_PROC.
487 */
488 static int
489 filt_procattach(struct knote *kn)
490 {
491 struct proc *p;
492 struct lwp *curl;
493
494 curl = curlwp;
495
496 mutex_enter(proc_lock);
497 if (kn->kn_flags & EV_FLAG1) {
498 /*
499 * NOTE_TRACK attaches to the child process too early
500 * for proc_find, so do a raw look up and check the state
501 * explicitly.
502 */
503 p = proc_find_raw(kn->kn_id);
504 if (p != NULL && p->p_stat != SIDL)
505 p = NULL;
506 } else {
507 p = proc_find(kn->kn_id);
508 }
509
510 if (p == NULL) {
511 mutex_exit(proc_lock);
512 return ESRCH;
513 }
514
515 /*
516 * Fail if it's not owned by you, or the last exec gave us
517 * setuid/setgid privs (unless you're root).
518 */
519 mutex_enter(p->p_lock);
520 mutex_exit(proc_lock);
521 if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
522 p, NULL, NULL, NULL) != 0) {
523 mutex_exit(p->p_lock);
524 return EACCES;
525 }
526
527 kn->kn_obj = p;
528 kn->kn_flags |= EV_CLEAR; /* automatically set */
529
530 /*
531 * internal flag indicating registration done by kernel
532 */
533 if (kn->kn_flags & EV_FLAG1) {
534 kn->kn_data = kn->kn_sdata; /* ppid */
535 kn->kn_fflags = NOTE_CHILD;
536 kn->kn_flags &= ~EV_FLAG1;
537 }
538 SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
539 mutex_exit(p->p_lock);
540
541 return 0;
542 }
543
544 /*
545 * Filter detach method for EVFILT_PROC.
546 *
547 * The knote may be attached to a different process, which may exit,
548 * leaving nothing for the knote to be attached to. So when the process
549 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
550 * it will be deleted when read out. However, as part of the knote deletion,
551 * this routine is called, so a check is needed to avoid actually performing
552 * a detach, because the original process might not exist any more.
553 */
554 static void
555 filt_procdetach(struct knote *kn)
556 {
557 struct proc *p;
558
559 if (kn->kn_status & KN_DETACHED)
560 return;
561
562 p = kn->kn_obj;
563
564 mutex_enter(p->p_lock);
565 SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
566 mutex_exit(p->p_lock);
567 }
568
569 /*
570 * Filter event method for EVFILT_PROC.
571 */
572 static int
573 filt_proc(struct knote *kn, long hint)
574 {
575 u_int event, fflag;
576 struct kevent kev;
577 struct kqueue *kq;
578 int error;
579
580 event = (u_int)hint & NOTE_PCTRLMASK;
581 kq = kn->kn_kq;
582 fflag = 0;
583
584 /* If the user is interested in this event, record it. */
585 if (kn->kn_sfflags & event)
586 fflag |= event;
587
588 if (event == NOTE_EXIT) {
589 struct proc *p = kn->kn_obj;
590
591 if (p != NULL)
592 kn->kn_data = P_WAITSTATUS(p);
593 /*
594 * Process is gone, so flag the event as finished.
595 *
596 * Detach the knote from watched process and mark
597 * it as such. We can't leave this to kqueue_scan(),
598 * since the process might not exist by then. And we
599 * have to do this now, since psignal KNOTE() is also
600 * called for zombies, and we might end up reading freed
601 * memory if the kevent had already been picked up
602 * and the knote g/c'ed.
603 */
604 filt_procdetach(kn);
605
606 mutex_spin_enter(&kq->kq_lock);
607 kn->kn_status |= KN_DETACHED;
608 /* Mark as ONESHOT, so that the knote is g/c'ed when read */
609 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
610 kn->kn_fflags |= fflag;
611 mutex_spin_exit(&kq->kq_lock);
612
613 return 1;
614 }
615
616 mutex_spin_enter(&kq->kq_lock);
617 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
618 /*
619 * Process forked, and user wants to track the new process,
620 * so attach a new knote to it, and immediately report an
621 * event with the parent's pid. Register knote with new
622 * process.
623 */
624 kev.ident = hint & NOTE_PDATAMASK; /* pid */
625 kev.filter = kn->kn_filter;
626 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
627 kev.fflags = kn->kn_sfflags;
628 kev.data = kn->kn_id; /* parent */
629 kev.udata = kn->kn_kevent.udata; /* preserve udata */
630 mutex_spin_exit(&kq->kq_lock);
631 error = kqueue_register(kq, &kev);
632 mutex_spin_enter(&kq->kq_lock);
633 if (error != 0)
634 kn->kn_fflags |= NOTE_TRACKERR;
635 }
636 kn->kn_fflags |= fflag;
637 fflag = kn->kn_fflags;
638 mutex_spin_exit(&kq->kq_lock);
639
640 return fflag != 0;
641 }
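/*
 * A minimal userland sketch of EVFILT_PROC usage, assuming the usual
 * userland includes.  NOTE_TRACK makes the kernel attach a knote to each
 * new child on fork, which is what the EV_FLAG1/NOTE_CHILD handling above
 * implements.
 */
#if 0
static void
watch_child(int kq, pid_t child)
{
	struct kevent kev, ev;

	EV_SET(&kev, child, EVFILT_PROC, EV_ADD | EV_ENABLE,
	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(EXIT_FAILURE, "kevent: EVFILT_PROC");

	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1) {
		if (ev.fflags & NOTE_CHILD)
			printf("pid %ld forked from parent %ld\n",
			    (long)ev.ident, (long)ev.data);
		if (ev.fflags & NOTE_EXIT)
			printf("pid %ld exited, wait status %ld\n",
			    (long)ev.ident, (long)ev.data);
	}
}
#endif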
642
643 static void
644 filt_timerexpire(void *knx)
645 {
646 struct knote *kn = knx;
647 int tticks;
648
649 mutex_enter(&kqueue_misc_lock);
650 kn->kn_data++;
651 knote_activate(kn);
652 if ((kn->kn_flags & EV_ONESHOT) == 0) {
653 tticks = mstohz(kn->kn_sdata);
654 if (tticks <= 0)
655 tticks = 1;
656 callout_schedule((callout_t *)kn->kn_hook, tticks);
657 }
658 mutex_exit(&kqueue_misc_lock);
659 }
660
661 /*
662 * data contains amount of time to sleep, in milliseconds
663 */
664 static int
665 filt_timerattach(struct knote *kn)
666 {
667 callout_t *calloutp;
668 struct kqueue *kq;
669 int tticks;
670
671 tticks = mstohz(kn->kn_sdata);
672
673 /* if the supplied value is under our resolution, use 1 tick */
674 if (tticks == 0) {
675 if (kn->kn_sdata == 0)
676 return EINVAL;
677 tticks = 1;
678 }
679
680 if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
681 (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
682 atomic_dec_uint(&kq_ncallouts);
683 return ENOMEM;
684 }
685 callout_init(calloutp, CALLOUT_MPSAFE);
686
687 kq = kn->kn_kq;
688 mutex_spin_enter(&kq->kq_lock);
689 kn->kn_flags |= EV_CLEAR; /* automatically set */
690 kn->kn_hook = calloutp;
691 mutex_spin_exit(&kq->kq_lock);
692
693 callout_reset(calloutp, tticks, filt_timerexpire, kn);
694
695 return (0);
696 }
697
698 static void
699 filt_timerdetach(struct knote *kn)
700 {
701 callout_t *calloutp;
702
703 calloutp = (callout_t *)kn->kn_hook;
704 callout_halt(calloutp, NULL);
705 callout_destroy(calloutp);
706 kmem_free(calloutp, sizeof(*calloutp));
707 atomic_dec_uint(&kq_ncallouts);
708 }
709
710 static int
711 filt_timer(struct knote *kn, long hint)
712 {
713 int rv;
714
715 mutex_enter(&kqueue_misc_lock);
716 rv = (kn->kn_data != 0);
717 mutex_exit(&kqueue_misc_lock);
718
719 return rv;
720 }
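/*
 * A minimal userland sketch of EVFILT_TIMER usage, assuming the usual
 * userland includes.  As described above, the event data is the timer
 * period in milliseconds, EV_CLEAR is set automatically, and kn_data
 * counts expirations between reads.
 */
#if 0
static void
tick_every_500ms(void)
{
	struct kevent kev, ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(EXIT_FAILURE, "kqueue");

	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(EXIT_FAILURE, "kevent: EVFILT_TIMER");

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
			printf("timer fired %lld time(s)\n",
			    (long long)ev.data);
	}
}
#endif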
721
722 /*
723 * filt_seltrue:
724 *
725 * This filter "event" routine simulates seltrue().
726 */
727 int
728 filt_seltrue(struct knote *kn, long hint)
729 {
730
731 /*
732 * We don't know how much data can be read/written,
733 * but we know that it *can* be. This is about as
734 * good as select/poll does as well.
735 */
736 kn->kn_data = 0;
737 return (1);
738 }
739
740 /*
741 * This provides a full kqfilter entry for device switch tables, which
742 * has the same effect as a filter using filt_seltrue() as its filter method.
743 */
744 static void
745 filt_seltruedetach(struct knote *kn)
746 {
747 /* Nothing to do */
748 }
749
750 const struct filterops seltrue_filtops = {
751 .f_isfd = 1,
752 .f_attach = NULL,
753 .f_detach = filt_seltruedetach,
754 .f_event = filt_seltrue,
755 };
756
757 int
758 seltrue_kqfilter(dev_t dev, struct knote *kn)
759 {
760 switch (kn->kn_filter) {
761 case EVFILT_READ:
762 case EVFILT_WRITE:
763 kn->kn_fop = &seltrue_filtops;
764 break;
765 default:
766 return (EINVAL);
767 }
768
769 /* Nothing more to do */
770 return (0);
771 }
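/*
 * Hypothetical driver sketch: a device that is always ready for I/O can
 * simply forward its d_kqfilter entry point to seltrue_kqfilter().
 * "exampledev" is an invented name.
 */
#if 0
static int
exampledev_kqfilter(dev_t dev, struct knote *kn)
{

	return seltrue_kqfilter(dev, kn);
}
#endif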
772
773 /*
774 * kqueue(2) system call.
775 */
776 static int
777 kqueue1(struct lwp *l, int flags, register_t *retval)
778 {
779 struct kqueue *kq;
780 file_t *fp;
781 int fd, error;
782
783 if ((error = fd_allocfile(&fp, &fd)) != 0)
784 return error;
785 fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
786 fp->f_type = DTYPE_KQUEUE;
787 fp->f_ops = &kqueueops;
788 kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
789 mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
790 cv_init(&kq->kq_cv, "kqueue");
791 selinit(&kq->kq_sel);
792 TAILQ_INIT(&kq->kq_head);
793 fp->f_kqueue = kq;
794 *retval = fd;
795 kq->kq_fdp = curlwp->l_fd;
796 fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
797 fd_affix(curproc, fp, fd);
798 return error;
799 }
800
801 /*
802 * kqueue(2) system call.
803 */
804 int
805 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
806 {
807 return kqueue1(l, 0, retval);
808 }
809
810 int
811 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
812 register_t *retval)
813 {
814 /* {
815 syscallarg(int) flags;
816 } */
817 return kqueue1(l, SCARG(uap, flags), retval);
818 }
819
820 /*
821 * kevent(2) system call.
822 */
823 int
824 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
825 struct kevent *changes, size_t index, int n)
826 {
827
828 return copyin(changelist + index, changes, n * sizeof(*changes));
829 }
830
831 int
832 kevent_put_events(void *ctx, struct kevent *events,
833 struct kevent *eventlist, size_t index, int n)
834 {
835
836 return copyout(events, eventlist + index, n * sizeof(*events));
837 }
838
839 static const struct kevent_ops kevent_native_ops = {
840 .keo_private = NULL,
841 .keo_fetch_timeout = copyin,
842 .keo_fetch_changes = kevent_fetch_changes,
843 .keo_put_events = kevent_put_events,
844 };
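/*
 * Hedged sketch of an alternative struct kevent_ops, of the kind a compat
 * or emulation layer might pass to kevent1() when its userland kevent
 * layout differs from the native one.  The example_compat_* names are
 * invented; the hook signatures match kevent_fetch_changes() and
 * kevent_put_events() above.  For brevity the "conversion" here is just
 * the native copy.
 */
#if 0
static int
example_compat_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	/* A real layer would copyin() its own record layout and convert. */
	return copyin(changelist + index, changes, n * sizeof(*changes));
}

static int
example_compat_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	/* A real layer would convert to its own layout before copyout(). */
	return copyout(events, eventlist + index, n * sizeof(*events));
}

static const struct kevent_ops example_compat_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = example_compat_fetch_changes,
	.keo_put_events = example_compat_put_events,
};
#endif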
845
846 int
847 sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
848 register_t *retval)
849 {
850 /* {
851 syscallarg(int) fd;
852 syscallarg(const struct kevent *) changelist;
853 syscallarg(size_t) nchanges;
854 syscallarg(struct kevent *) eventlist;
855 syscallarg(size_t) nevents;
856 syscallarg(const struct timespec *) timeout;
857 } */
858
859 return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
860 SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
861 SCARG(uap, timeout), &kevent_native_ops);
862 }
863
864 int
865 kevent1(register_t *retval, int fd,
866 const struct kevent *changelist, size_t nchanges,
867 struct kevent *eventlist, size_t nevents,
868 const struct timespec *timeout,
869 const struct kevent_ops *keops)
870 {
871 struct kevent *kevp;
872 struct kqueue *kq;
873 struct timespec ts;
874 size_t i, n, ichange;
875 int nerrors, error;
876 struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
877 file_t *fp;
878
879 /* check that we're dealing with a kq */
880 fp = fd_getfile(fd);
881 if (fp == NULL)
882 return (EBADF);
883
884 if (fp->f_type != DTYPE_KQUEUE) {
885 fd_putfile(fd);
886 return (EBADF);
887 }
888
889 if (timeout != NULL) {
890 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
891 if (error)
892 goto done;
893 timeout = &ts;
894 }
895
896 kq = fp->f_kqueue;
897 nerrors = 0;
898 ichange = 0;
899
900 /* traverse list of events to register */
901 while (nchanges > 0) {
902 n = MIN(nchanges, __arraycount(kevbuf));
903 error = (*keops->keo_fetch_changes)(keops->keo_private,
904 changelist, kevbuf, ichange, n);
905 if (error)
906 goto done;
907 for (i = 0; i < n; i++) {
908 kevp = &kevbuf[i];
909 kevp->flags &= ~EV_SYSFLAGS;
910 /* register each knote */
911 error = kqueue_register(kq, kevp);
912 if (!error && !(kevp->flags & EV_RECEIPT))
913 continue;
914 if (nevents == 0)
915 goto done;
916 kevp->flags = EV_ERROR;
917 kevp->data = error;
918 error = (*keops->keo_put_events)
919 (keops->keo_private, kevp,
920 eventlist, nerrors, 1);
921 if (error)
922 goto done;
923 nevents--;
924 nerrors++;
925 }
926 nchanges -= n; /* update the results */
927 ichange += n;
928 }
929 if (nerrors) {
930 *retval = nerrors;
931 error = 0;
932 goto done;
933 }
934
935 /* actually scan through the events */
936 error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
937 kevbuf, __arraycount(kevbuf));
938 done:
939 fd_putfile(fd);
940 return (error);
941 }
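/*
 * A minimal userland sketch of the kevent(2) path handled above, assuming
 * the usual userland includes and an already-open descriptor "sock" to
 * monitor.
 */
#if 0
static void
wait_for_input(int sock)
{
	struct kevent kev, ev;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(EXIT_FAILURE, "kqueue");

	EV_SET(&kev, sock, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(EXIT_FAILURE, "kevent: EV_ADD");

	/* Wait up to five seconds for the descriptor to become readable. */
	n = kevent(kq, NULL, 0, &ev, 1, &ts);
	if (n == 1)
		printf("%lld byte(s) ready on fd %lu\n",
		    (long long)ev.data, (unsigned long)ev.ident);
	else if (n == 0)
		printf("timed out\n");
}
#endif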
942
943 /*
944 * Register a given kevent kev onto the kqueue
945 */
946 static int
947 kqueue_register(struct kqueue *kq, struct kevent *kev)
948 {
949 struct kfilter *kfilter;
950 filedesc_t *fdp;
951 file_t *fp;
952 fdfile_t *ff;
953 struct knote *kn, *newkn;
954 struct klist *list;
955 int error, fd, rv;
956
957 fdp = kq->kq_fdp;
958 fp = NULL;
959 kn = NULL;
960 error = 0;
961 fd = 0;
962
963 newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
964
965 rw_enter(&kqueue_filter_lock, RW_READER);
966 kfilter = kfilter_byfilter(kev->filter);
967 if (kfilter == NULL || kfilter->filtops == NULL) {
968 /* filter not found or not implemented */
969 rw_exit(&kqueue_filter_lock);
970 kmem_free(newkn, sizeof(*newkn));
971 return (EINVAL);
972 }
973
974 /* search if knote already exists */
975 if (kfilter->filtops->f_isfd) {
976 /* monitoring a file descriptor */
977 /* validate descriptor */
978 if (kev->ident > INT_MAX
979 || (fp = fd_getfile(fd = kev->ident)) == NULL) {
980 rw_exit(&kqueue_filter_lock);
981 kmem_free(newkn, sizeof(*newkn));
982 return EBADF;
983 }
984 mutex_enter(&fdp->fd_lock);
985 ff = fdp->fd_dt->dt_ff[fd];
986 if (ff->ff_refcnt & FR_CLOSING) {
987 error = EBADF;
988 goto doneunlock;
989 }
990 if (fd <= fdp->fd_lastkqfile) {
991 SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
992 if (kq == kn->kn_kq &&
993 kev->filter == kn->kn_filter)
994 break;
995 }
996 }
997 } else {
998 /*
999 * not monitoring a file descriptor, so
1000 * lookup knotes in internal hash table
1001 */
1002 mutex_enter(&fdp->fd_lock);
1003 if (fdp->fd_knhashmask != 0) {
1004 list = &fdp->fd_knhash[
1005 KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1006 SLIST_FOREACH(kn, list, kn_link) {
1007 if (kev->ident == kn->kn_id &&
1008 kq == kn->kn_kq &&
1009 kev->filter == kn->kn_filter)
1010 break;
1011 }
1012 }
1013 }
1014
1015 /*
1016 * kn now contains the matching knote, or NULL if no match
1017 */
1018 if (kev->flags & EV_ADD) {
1019 if (kn == NULL) {
1020 /* create new knote */
1021 kn = newkn;
1022 newkn = NULL;
1023 kn->kn_obj = fp;
1024 kn->kn_id = kev->ident;
1025 kn->kn_kq = kq;
1026 kn->kn_fop = kfilter->filtops;
1027 kn->kn_kfilter = kfilter;
1028 kn->kn_sfflags = kev->fflags;
1029 kn->kn_sdata = kev->data;
1030 kev->fflags = 0;
1031 kev->data = 0;
1032 kn->kn_kevent = *kev;
1033
1034 KASSERT(kn->kn_fop != NULL);
1035 /*
1036 * apply reference count to knote structure, and
1037 * do not release it at the end of this routine.
1038 */
1039 fp = NULL;
1040
1041 if (!kn->kn_fop->f_isfd) {
1042 /*
1043 * If knote is not on an fd, store on
1044 * internal hash table.
1045 */
1046 if (fdp->fd_knhashmask == 0) {
1047 /* XXXAD can block with fd_lock held */
1048 fdp->fd_knhash = hashinit(KN_HASHSIZE,
1049 HASH_LIST, true,
1050 &fdp->fd_knhashmask);
1051 }
1052 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
1053 fdp->fd_knhashmask)];
1054 } else {
1055 /* Otherwise, knote is on an fd. */
1056 list = (struct klist *)
1057 &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1058 if ((int)kn->kn_id > fdp->fd_lastkqfile)
1059 fdp->fd_lastkqfile = kn->kn_id;
1060 }
1061 SLIST_INSERT_HEAD(list, kn, kn_link);
1062
1063 KERNEL_LOCK(1, NULL); /* XXXSMP */
1064 error = (*kfilter->filtops->f_attach)(kn);
1065 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1066 if (error != 0) {
1067 #ifdef DEBUG
1068 const file_t *ft = kn->kn_obj;
1069 uprintf("%s: event type %d not supported for "
1070 "file type %d/%s (error %d)\n", __func__,
1071 kn->kn_filter, ft ? ft->f_type : -1,
1072 ft ? ft->f_ops->fo_name : "?", error);
1073 #endif
1074
1075 /* knote_detach() drops fdp->fd_lock */
1076 knote_detach(kn, fdp, false);
1077 goto done;
1078 }
1079 atomic_inc_uint(&kfilter->refcnt);
1080 } else {
1081 /*
1082 * The user may change some filter values after the
1083 * initial EV_ADD, but doing so will not reset any
1084 * filters which have already been triggered.
1085 */
1086 kn->kn_sfflags = kev->fflags;
1087 kn->kn_sdata = kev->data;
1088 kn->kn_kevent.udata = kev->udata;
1089 }
1090 /*
1091 * We can get here if we are trying to attach
1092 * an event to a file descriptor that does not
1093 * support events, and the attach routine is
1094 * broken and does not return an error.
1095 */
1096 KASSERT(kn->kn_fop != NULL);
1097 KASSERT(kn->kn_fop->f_event != NULL);
1098 KERNEL_LOCK(1, NULL); /* XXXSMP */
1099 rv = (*kn->kn_fop->f_event)(kn, 0);
1100 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1101 if (rv)
1102 knote_activate(kn);
1103 } else {
1104 if (kn == NULL) {
1105 error = ENOENT;
1106 goto doneunlock;
1107 }
1108 if (kev->flags & EV_DELETE) {
1109 /* knote_detach() drops fdp->fd_lock */
1110 knote_detach(kn, fdp, true);
1111 goto done;
1112 }
1113 }
1114
1115 /* disable knote */
1116 if ((kev->flags & EV_DISABLE)) {
1117 mutex_spin_enter(&kq->kq_lock);
1118 if ((kn->kn_status & KN_DISABLED) == 0)
1119 kn->kn_status |= KN_DISABLED;
1120 mutex_spin_exit(&kq->kq_lock);
1121 }
1122
1123 /* enable knote */
1124 if ((kev->flags & EV_ENABLE)) {
1125 knote_enqueue(kn);
1126 }
1127 doneunlock:
1128 mutex_exit(&fdp->fd_lock);
1129 done:
1130 rw_exit(&kqueue_filter_lock);
1131 if (newkn != NULL)
1132 kmem_free(newkn, sizeof(*newkn));
1133 if (fp != NULL)
1134 fd_putfile(fd);
1135 return (error);
1136 }
1137
1138 #if defined(DEBUG)
1139 #define KN_FMT(buf, kn) \
1140 (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
1141
1142 static void
1143 kqueue_check(const char *func, size_t line, const struct kqueue *kq)
1144 {
1145 const struct knote *kn;
1146 int count;
1147 int nmarker;
1148 char buf[128];
1149
1150 KASSERT(mutex_owned(&kq->kq_lock));
1151 KASSERT(kq->kq_count >= 0);
1152
1153 count = 0;
1154 nmarker = 0;
1155 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1156 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
1157 panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
1158 func, line, kq, kn, KN_FMT(buf, kn));
1159 }
1160 if ((kn->kn_status & KN_MARKER) == 0) {
1161 if (kn->kn_kq != kq) {
1162 panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
1163 func, line, kq, kn, kn->kn_kq,
1164 KN_FMT(buf, kn));
1165 }
1166 if ((kn->kn_status & KN_ACTIVE) == 0) {
1167 panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
1168 func, line, kq, kn, KN_FMT(buf, kn));
1169 }
1170 count++;
1171 if (count > kq->kq_count) {
1172 goto bad;
1173 }
1174 } else {
1175 nmarker++;
1176 #if 0
1177 if (nmarker > 10000) {
1178 panic("%s,%zu: kq=%p too many markers: "
1179 "%d != %d, nmarker=%d",
1180 func, line, kq, kq->kq_count, count,
1181 nmarker);
1182 }
1183 #endif
1184 }
1185 }
1186 if (kq->kq_count != count) {
1187 bad:
1188 panic("%s,%zu: kq=%p kq->kq_count(%d) != count(%d), nmarker=%d",
1189 func, line, kq, kq->kq_count, count, nmarker);
1190 }
1191 }
1192 #define kq_check(a) kqueue_check(__func__, __LINE__, (a))
1193 #else /* defined(DEBUG) */
1194 #define kq_check(a) /* nothing */
1195 #endif /* defined(DEBUG) */
1196
1197 /*
1198 * Scan through the list of events on fp (for a maximum of maxevents),
1199 * returning the results in ulistp. The timeout is determined by tsp: if
1200 * NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
1201 * as appropriate.
1202 */
1203 static int
1204 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
1205 const struct timespec *tsp, register_t *retval,
1206 const struct kevent_ops *keops, struct kevent *kevbuf,
1207 size_t kevcnt)
1208 {
1209 struct kqueue *kq;
1210 struct kevent *kevp;
1211 struct timespec ats, sleepts;
1212 struct knote *kn, *marker, morker;
1213 size_t count, nkev, nevents;
1214 int timeout, error, rv;
1215 filedesc_t *fdp;
1216
1217 fdp = curlwp->l_fd;
1218 kq = fp->f_kqueue;
1219 count = maxevents;
1220 nkev = nevents = error = 0;
1221 if (count == 0) {
1222 *retval = 0;
1223 return 0;
1224 }
1225
1226 if (tsp) { /* timeout supplied */
1227 ats = *tsp;
1228 if (inittimeleft(&ats, &sleepts) == -1) {
1229 *retval = maxevents;
1230 return EINVAL;
1231 }
1232 timeout = tstohz(&ats);
1233 if (timeout <= 0)
1234 timeout = -1; /* do poll */
1235 } else {
1236 /* no timeout, wait forever */
1237 timeout = 0;
1238 }
1239
1240 memset(&morker, 0, sizeof(morker));
1241 marker = &morker;
1242 marker->kn_status = KN_MARKER;
1243 mutex_spin_enter(&kq->kq_lock);
1244 retry:
1245 kevp = kevbuf;
1246 if (kq->kq_count == 0) {
1247 if (timeout >= 0) {
1248 error = cv_timedwait_sig(&kq->kq_cv,
1249 &kq->kq_lock, timeout);
1250 if (error == 0) {
1251 if (tsp == NULL || (timeout =
1252 gettimeleft(&ats, &sleepts)) > 0)
1253 goto retry;
1254 } else {
1255 /* don't restart after signals... */
1256 if (error == ERESTART)
1257 error = EINTR;
1258 if (error == EWOULDBLOCK)
1259 error = 0;
1260 }
1261 }
1262 mutex_spin_exit(&kq->kq_lock);
1263 } else {
1264 /* mark end of knote list */
1265 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1266
1267 /*
1268 * Acquire the fdp->fd_lock interlock to avoid races with
1269 * file creation/destruction from other threads.
1270 */
1271 mutex_spin_exit(&kq->kq_lock);
1272 mutex_enter(&fdp->fd_lock);
1273 mutex_spin_enter(&kq->kq_lock);
1274
1275 while (count != 0) {
1276 kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */
1277 while ((kn->kn_status & KN_MARKER) != 0) {
1278 if (kn == marker) {
1279 /* it's our marker, stop */
1280 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1281 if (count < maxevents || (tsp != NULL &&
1282 (timeout = gettimeleft(&ats,
1283 &sleepts)) <= 0))
1284 goto done;
1285 mutex_exit(&fdp->fd_lock);
1286 goto retry;
1287 }
1288 /* someone else's marker. */
1289 kn = TAILQ_NEXT(kn, kn_tqe);
1290 }
1291 kq_check(kq);
1292 kq->kq_count--;
1293 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1294 kn->kn_status &= ~KN_QUEUED;
1295 kn->kn_status |= KN_BUSY;
1296 kq_check(kq);
1297 if (kn->kn_status & KN_DISABLED) {
1298 kn->kn_status &= ~KN_BUSY;
1299 /* don't want disabled events */
1300 continue;
1301 }
1302 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1303 mutex_spin_exit(&kq->kq_lock);
1304 KASSERT(kn->kn_fop != NULL);
1305 KASSERT(kn->kn_fop->f_event != NULL);
1306 KERNEL_LOCK(1, NULL); /* XXXSMP */
1307 KASSERT(mutex_owned(&fdp->fd_lock));
1308 rv = (*kn->kn_fop->f_event)(kn, 0);
1309 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1310 mutex_spin_enter(&kq->kq_lock);
1311 /* Re-poll if note was re-enqueued. */
1312 if ((kn->kn_status & KN_QUEUED) != 0) {
1313 kn->kn_status &= ~KN_BUSY;
1314 continue;
1315 }
1316 if (rv == 0) {
1317 /*
1318 * non-ONESHOT event that hasn't
1319 * triggered again, so de-queue.
1320 */
1321 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1322 continue;
1323 }
1324 }
1325 /* XXXAD should be obtained from f_event if !oneshot. */
1326 *kevp++ = kn->kn_kevent;
1327 nkev++;
1328 if (kn->kn_flags & EV_ONESHOT) {
1329 /* delete ONESHOT events after retrieval */
1330 kn->kn_status &= ~KN_BUSY;
1331 mutex_spin_exit(&kq->kq_lock);
1332 knote_detach(kn, fdp, true);
1333 mutex_enter(&fdp->fd_lock);
1334 mutex_spin_enter(&kq->kq_lock);
1335 } else if (kn->kn_flags & EV_CLEAR) {
1336 /* clear state after retrieval */
1337 kn->kn_data = 0;
1338 kn->kn_fflags = 0;
1339 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
1340 } else if (kn->kn_flags & EV_DISPATCH) {
1341 kn->kn_status |= KN_DISABLED;
1342 kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
1343 } else {
1344 /* add event back on list */
1345 kq_check(kq);
1346 kn->kn_status |= KN_QUEUED;
1347 kn->kn_status &= ~KN_BUSY;
1348 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1349 kq->kq_count++;
1350 kq_check(kq);
1351 }
1352 if (nkev == kevcnt) {
1353 /* do copyouts in kevcnt chunks */
1354 mutex_spin_exit(&kq->kq_lock);
1355 mutex_exit(&fdp->fd_lock);
1356 error = (*keops->keo_put_events)
1357 (keops->keo_private,
1358 kevbuf, ulistp, nevents, nkev);
1359 mutex_enter(&fdp->fd_lock);
1360 mutex_spin_enter(&kq->kq_lock);
1361 nevents += nkev;
1362 nkev = 0;
1363 kevp = kevbuf;
1364 }
1365 count--;
1366 if (error != 0 || count == 0) {
1367 /* remove marker */
1368 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1369 break;
1370 }
1371 }
1372 done:
1373 mutex_spin_exit(&kq->kq_lock);
1374 mutex_exit(&fdp->fd_lock);
1375 }
1376 if (nkev != 0) {
1377 /* copyout remaining events */
1378 error = (*keops->keo_put_events)(keops->keo_private,
1379 kevbuf, ulistp, nevents, nkev);
1380 }
1381 *retval = maxevents - count;
1382
1383 return error;
1384 }
1385
1386 /*
1387 * fileops ioctl method for a kqueue descriptor.
1388 *
1389 * Two ioctls are currently supported. They both use struct kfilter_mapping:
1390 * KFILTER_BYNAME find the filter id for a given name; len is ignored.
1391 * KFILTER_BYFILTER find the name for a given filter id, and return the
1392 * result in name, which is of size len.
1393 */
1394 /*ARGSUSED*/
1395 static int
1396 kqueue_ioctl(file_t *fp, u_long com, void *data)
1397 {
1398 struct kfilter_mapping *km;
1399 const struct kfilter *kfilter;
1400 char *name;
1401 int error;
1402
1403 km = data;
1404 error = 0;
1405 name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1406
1407 switch (com) {
1408 case KFILTER_BYFILTER: /* convert filter -> name */
1409 rw_enter(&kqueue_filter_lock, RW_READER);
1410 kfilter = kfilter_byfilter(km->filter);
1411 if (kfilter != NULL) {
1412 strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1413 rw_exit(&kqueue_filter_lock);
1414 error = copyoutstr(name, km->name, km->len, NULL);
1415 } else {
1416 rw_exit(&kqueue_filter_lock);
1417 error = ENOENT;
1418 }
1419 break;
1420
1421 case KFILTER_BYNAME: /* convert name -> filter */
1422 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1423 if (error) {
1424 break;
1425 }
1426 rw_enter(&kqueue_filter_lock, RW_READER);
1427 kfilter = kfilter_byname(name);
1428 if (kfilter != NULL)
1429 km->filter = kfilter->filter;
1430 else
1431 error = ENOENT;
1432 rw_exit(&kqueue_filter_lock);
1433 break;
1434
1435 default:
1436 error = ENOTTY;
1437 break;
1438
1439 }
1440 kmem_free(name, KFILTER_MAXNAME);
1441 return (error);
1442 }
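/*
 * A minimal userland sketch of the KFILTER_BYNAME ioctl described above,
 * using only the struct kfilter_mapping fields referenced in the code
 * (name, len, filter); the usual userland includes are assumed.
 */
#if 0
static uint32_t
filter_id_by_name(int kq)
{
	struct kfilter_mapping km;
	char name[KFILTER_MAXNAME] = "EVFILT_READ";

	km.name = name;
	km.len = 0;			/* ignored for KFILTER_BYNAME */
	if (ioctl(kq, KFILTER_BYNAME, &km) == -1)
		err(EXIT_FAILURE, "KFILTER_BYNAME");
	return km.filter;
}
#endif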
1443
1444 /*
1445 * fileops fcntl method for a kqueue descriptor.
1446 */
1447 static int
1448 kqueue_fcntl(file_t *fp, u_int com, void *data)
1449 {
1450
1451 return (ENOTTY);
1452 }
1453
1454 /*
1455 * fileops poll method for a kqueue descriptor.
1456 * Determine if kqueue has events pending.
1457 */
1458 static int
1459 kqueue_poll(file_t *fp, int events)
1460 {
1461 struct kqueue *kq;
1462 int revents;
1463
1464 kq = fp->f_kqueue;
1465
1466 revents = 0;
1467 if (events & (POLLIN | POLLRDNORM)) {
1468 mutex_spin_enter(&kq->kq_lock);
1469 if (kq->kq_count != 0) {
1470 revents |= events & (POLLIN | POLLRDNORM);
1471 } else {
1472 selrecord(curlwp, &kq->kq_sel);
1473 }
1474 kq_check(kq);
1475 mutex_spin_exit(&kq->kq_lock);
1476 }
1477
1478 return revents;
1479 }
1480
1481 /*
1482 * fileops stat method for a kqueue descriptor.
1483 * Returns dummy info, with st_size being number of events pending.
1484 */
1485 static int
1486 kqueue_stat(file_t *fp, struct stat *st)
1487 {
1488 struct kqueue *kq;
1489
1490 kq = fp->f_kqueue;
1491
1492 memset(st, 0, sizeof(*st));
1493 st->st_size = kq->kq_count;
1494 st->st_blksize = sizeof(struct kevent);
1495 st->st_mode = S_IFIFO;
1496
1497 return 0;
1498 }
1499
1500 static void
1501 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
1502 {
1503 struct knote *kn;
1504 filedesc_t *fdp;
1505
1506 fdp = kq->kq_fdp;
1507
1508 KASSERT(mutex_owned(&fdp->fd_lock));
1509
1510 for (kn = SLIST_FIRST(list); kn != NULL;) {
1511 if (kq != kn->kn_kq) {
1512 kn = SLIST_NEXT(kn, kn_link);
1513 continue;
1514 }
1515 knote_detach(kn, fdp, true);
1516 mutex_enter(&fdp->fd_lock);
1517 kn = SLIST_FIRST(list);
1518 }
1519 }
1520
1521
1522 /*
1523 * fileops close method for a kqueue descriptor.
1524 */
1525 static int
1526 kqueue_close(file_t *fp)
1527 {
1528 struct kqueue *kq;
1529 filedesc_t *fdp;
1530 fdfile_t *ff;
1531 int i;
1532
1533 kq = fp->f_kqueue;
1534 fp->f_kqueue = NULL;
1535 fp->f_type = 0;
1536 fdp = curlwp->l_fd;
1537
1538 mutex_enter(&fdp->fd_lock);
1539 for (i = 0; i <= fdp->fd_lastkqfile; i++) {
1540 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1541 continue;
1542 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1543 }
1544 if (fdp->fd_knhashmask != 0) {
1545 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
1546 kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
1547 }
1548 }
1549 mutex_exit(&fdp->fd_lock);
1550
1551 KASSERT(kq->kq_count == 0);
1552 mutex_destroy(&kq->kq_lock);
1553 cv_destroy(&kq->kq_cv);
1554 seldestroy(&kq->kq_sel);
1555 kmem_free(kq, sizeof(*kq));
1556
1557 return (0);
1558 }
1559
1560 /*
1561 * struct fileops kqfilter method for a kqueue descriptor.
1562 * Event triggered when monitored kqueue changes.
1563 */
1564 static int
1565 kqueue_kqfilter(file_t *fp, struct knote *kn)
1566 {
1567 struct kqueue *kq;
1568
1569 kq = ((file_t *)kn->kn_obj)->f_kqueue;
1570
1571 KASSERT(fp == kn->kn_obj);
1572
1573 if (kn->kn_filter != EVFILT_READ)
1574 return 1;
1575
1576 kn->kn_fop = &kqread_filtops;
1577 mutex_enter(&kq->kq_lock);
1578 SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
1579 mutex_exit(&kq->kq_lock);
1580
1581 return 0;
1582 }
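/*
 * A minimal userland sketch of the case handled above: one kqueue
 * monitored by another for readability.
 */
#if 0
	struct kevent kev;
	int inner, outer;

	inner = kqueue();
	outer = kqueue();
	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, 0);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1)
		err(EXIT_FAILURE, "kevent");
	/* "outer" now reports readable whenever "inner" has pending events. */
#endif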
1583
1584
1585 /*
1586 * Walk down a list of knotes, activating them if their event has
1587 * triggered. The caller's object lock (e.g. device driver lock)
1588 * must be held.
1589 */
1590 void
1591 knote(struct klist *list, long hint)
1592 {
1593 struct knote *kn, *tmpkn;
1594
1595 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1596 KASSERT(kn->kn_fop != NULL);
1597 KASSERT(kn->kn_fop->f_event != NULL);
1598 if ((*kn->kn_fop->f_event)(kn, hint))
1599 knote_activate(kn);
1600 }
1601 }
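/*
 * Hypothetical driver sketch: posting an event while holding the driver's
 * own object lock, following the locking rules at the top of this file.
 * "exampledev_softc" and its members are invented; passing NOTE_SUBMIT as
 * the hint tells f_event that the object lock is already held.
 */
#if 0
struct exampledev_softc {
	kmutex_t	sc_lock;	/* the "object lock" */
	struct selinfo	sc_rsel;
	int		sc_rxavail;
};

static void
exampledev_rxintr(struct exampledev_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	sc->sc_rxavail++;
	knote(&sc->sc_rsel.sel_klist, NOTE_SUBMIT);
	/*
	 * selnotify(&sc->sc_rsel, POLLIN, NOTE_SUBMIT) would additionally
	 * wake select(2)/poll(2) waiters.
	 */
	mutex_exit(&sc->sc_lock);
}
#endif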
1602
1603 /*
1604 * Remove all knotes referencing a specified fd
1605 */
1606 void
1607 knote_fdclose(int fd)
1608 {
1609 struct klist *list;
1610 struct knote *kn;
1611 filedesc_t *fdp;
1612
1613 fdp = curlwp->l_fd;
1614 list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1615 mutex_enter(&fdp->fd_lock);
1616 while ((kn = SLIST_FIRST(list)) != NULL) {
1617 knote_detach(kn, fdp, true);
1618 mutex_enter(&fdp->fd_lock);
1619 }
1620 mutex_exit(&fdp->fd_lock);
1621 }
1622
1623 /*
1624 * Drop knote. Called with fdp->fd_lock held, and will drop before
1625 * returning.
1626 */
1627 static void
1628 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
1629 {
1630 struct klist *list;
1631 struct kqueue *kq;
1632
1633 kq = kn->kn_kq;
1634
1635 KASSERT((kn->kn_status & KN_MARKER) == 0);
1636 KASSERT(mutex_owned(&fdp->fd_lock));
1637
1638 KASSERT(kn->kn_fop != NULL);
1639 /* Remove from monitored object. */
1640 if (dofop) {
1641 KASSERT(kn->kn_fop->f_detach != NULL);
1642 KERNEL_LOCK(1, NULL); /* XXXSMP */
1643 (*kn->kn_fop->f_detach)(kn);
1644 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1645 }
1646
1647 /* Remove from descriptor table. */
1648 if (kn->kn_fop->f_isfd)
1649 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1650 else
1651 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1652
1653 SLIST_REMOVE(list, kn, knote, kn_link);
1654
1655 /* Remove from kqueue. */
1656 again:
1657 mutex_spin_enter(&kq->kq_lock);
1658 if ((kn->kn_status & KN_QUEUED) != 0) {
1659 kq_check(kq);
1660 kq->kq_count--;
1661 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1662 kn->kn_status &= ~KN_QUEUED;
1663 kq_check(kq);
1664 } else if (kn->kn_status & KN_BUSY) {
1665 mutex_spin_exit(&kq->kq_lock);
1666 goto again;
1667 }
1668 mutex_spin_exit(&kq->kq_lock);
1669
1670 mutex_exit(&fdp->fd_lock);
1671 if (kn->kn_fop->f_isfd)
1672 fd_putfile(kn->kn_id);
1673 atomic_dec_uint(&kn->kn_kfilter->refcnt);
1674 kmem_free(kn, sizeof(*kn));
1675 }
1676
1677 /*
1678 * Re-enable the knote and queue it if it is active but not yet queued.
1679 */
1680 static void
1681 knote_enqueue(struct knote *kn)
1682 {
1683 struct kqueue *kq;
1684
1685 KASSERT((kn->kn_status & KN_MARKER) == 0);
1686
1687 kq = kn->kn_kq;
1688
1689 mutex_spin_enter(&kq->kq_lock);
1690 if ((kn->kn_status & KN_DISABLED) != 0) {
1691 kn->kn_status &= ~KN_DISABLED;
1692 }
1693 if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
1694 kq_check(kq);
1695 kn->kn_status |= KN_QUEUED;
1696 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1697 kq->kq_count++;
1698 kq_check(kq);
1699 cv_broadcast(&kq->kq_cv);
1700 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1701 }
1702 mutex_spin_exit(&kq->kq_lock);
1703 }
1704 /*
1705 * Mark the knote active and queue it, unless it is disabled or already queued.
1706 */
1707 static void
1708 knote_activate(struct knote *kn)
1709 {
1710 struct kqueue *kq;
1711
1712 KASSERT((kn->kn_status & KN_MARKER) == 0);
1713
1714 kq = kn->kn_kq;
1715
1716 mutex_spin_enter(&kq->kq_lock);
1717 kn->kn_status |= KN_ACTIVE;
1718 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
1719 kq_check(kq);
1720 kn->kn_status |= KN_QUEUED;
1721 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1722 kq->kq_count++;
1723 kq_check(kq);
1724 cv_broadcast(&kq->kq_cv);
1725 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1726 }
1727 mutex_spin_exit(&kq->kq_lock);
1728 }
1729