1 /* $NetBSD: kern_event.c,v 1.108.2.3 2021/04/03 22:29:00 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon (at) FreeBSD.org>
34 * Copyright (c) 2009 Apple, Inc
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
59 */
60
61 #include <sys/cdefs.h>
62 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.108.2.3 2021/04/03 22:29:00 thorpej Exp $");
63
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/kernel.h>
67 #include <sys/wait.h>
68 #include <sys/proc.h>
69 #include <sys/file.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/poll.h>
75 #include <sys/kmem.h>
76 #include <sys/stat.h>
77 #include <sys/filedesc.h>
78 #include <sys/syscallargs.h>
79 #include <sys/kauth.h>
80 #include <sys/conf.h>
81 #include <sys/atomic.h>
82
83 static int kqueue_scan(file_t *, size_t, struct kevent *,
84 const struct timespec *, register_t *,
85 const struct kevent_ops *, struct kevent *,
86 size_t);
87 static int kqueue_ioctl(file_t *, u_long, void *);
88 static int kqueue_fcntl(file_t *, u_int, void *);
89 static int kqueue_poll(file_t *, int);
90 static int kqueue_kqfilter(file_t *, struct knote *);
91 static int kqueue_stat(file_t *, struct stat *);
92 static int kqueue_close(file_t *);
93 static int kqueue_register(struct kqueue *, struct kevent *);
94 static void kqueue_doclose(struct kqueue *, struct klist *, int);
95
96 static void knote_detach(struct knote *, filedesc_t *fdp, bool);
97 static void knote_enqueue(struct knote *);
98 static void knote_activate(struct knote *);
99
100 static void filt_kqdetach(struct knote *);
101 static int filt_kqueue(struct knote *, long hint);
102 static int filt_procattach(struct knote *);
103 static void filt_procdetach(struct knote *);
104 static int filt_proc(struct knote *, long hint);
105 static int filt_fileattach(struct knote *);
106 static void filt_timerexpire(void *x);
107 static int filt_timerattach(struct knote *);
108 static void filt_timerdetach(struct knote *);
109 static int filt_timer(struct knote *, long hint);
110 static int filt_fsattach(struct knote *kn);
111 static void filt_fsdetach(struct knote *kn);
112 static int filt_fs(struct knote *kn, long hint);
113 static int filt_userattach(struct knote *);
114 static void filt_userdetach(struct knote *);
115 static int filt_user(struct knote *, long hint);
116 static void filt_usertouch(struct knote *, struct kevent *, long type);
117
118 static const struct fileops kqueueops = {
119 .fo_name = "kqueue",
120 .fo_read = (void *)enxio,
121 .fo_write = (void *)enxio,
122 .fo_ioctl = kqueue_ioctl,
123 .fo_fcntl = kqueue_fcntl,
124 .fo_poll = kqueue_poll,
125 .fo_stat = kqueue_stat,
126 .fo_close = kqueue_close,
127 .fo_kqfilter = kqueue_kqfilter,
128 .fo_restart = fnullop_restart,
129 };
130
131 static const struct filterops kqread_filtops = {
132 .f_isfd = 1,
133 .f_attach = NULL,
134 .f_detach = filt_kqdetach,
135 .f_event = filt_kqueue,
136 };
137
138 static const struct filterops proc_filtops = {
139 .f_isfd = 0,
140 .f_attach = filt_procattach,
141 .f_detach = filt_procdetach,
142 .f_event = filt_proc,
143 };
144
145 static const struct filterops file_filtops = {
146 .f_isfd = 1,
147 .f_attach = filt_fileattach,
148 .f_detach = NULL,
149 .f_event = NULL,
150 };
151
152 static const struct filterops timer_filtops = {
153 .f_isfd = 0,
154 .f_attach = filt_timerattach,
155 .f_detach = filt_timerdetach,
156 .f_event = filt_timer,
157 };
158
159 static const struct filterops fs_filtops = {
160 .f_isfd = 0,
161 .f_attach = filt_fsattach,
162 .f_detach = filt_fsdetach,
163 .f_event = filt_fs,
164 };
165
166 static const struct filterops user_filtops = {
167 .f_isfd = 0,
168 .f_attach = filt_userattach,
169 .f_detach = filt_userdetach,
170 .f_event = filt_user,
171 .f_touch = filt_usertouch,
172 };
173
174 static u_int kq_ncallouts = 0;
175 static int kq_calloutmax = (4 * 1024);
176
177 #define KN_HASHSIZE 64 /* XXX should be tunable */
178 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
179
180 extern const struct filterops sig_filtops;
181
182 #define KQ_FLUX_WAKEUP(kq) cv_broadcast(&(kq)->kq_cv)
183
184 /*
185 * Table for all system-defined filters.
186 * These should be listed in the numeric order of the EVFILT_* defines.
187 * If filtops is NULL, the filter isn't implemented in NetBSD.
188 * End of list is when name is NULL.
189 *
190 * Note that 'refcnt' is meaningless for built-in filters.
191 */
192 struct kfilter {
193 const char *name; /* name of filter */
194 uint32_t filter; /* id of filter */
195 unsigned refcnt; /* reference count */
196 const struct filterops *filtops;/* operations for filter */
197 size_t namelen; /* length of name string */
198 };
199
200 /* System defined filters */
201 static struct kfilter sys_kfilters[] = {
202 { "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
203 { "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
204 { "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
205 { "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
206 { "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
207 { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
208 { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
209 { "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
210 { "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 },
211 { NULL, 0, 0, NULL, 0 },
212 };
213
214 /* User defined kfilters */
215 static struct kfilter *user_kfilters; /* array */
216 static int user_kfilterc; /* current offset */
217 static int user_kfiltermaxc; /* max size so far */
218 static size_t user_kfiltersz; /* size of allocated memory */
219
220 /*
221 * Global Locks.
222 *
223 * Lock order:
224 *
225 * kqueue_filter_lock
226 * -> kn_kq->kq_fdp->fd_lock
227 * -> object lock (e.g., device driver lock, kqueue_misc_lock, &c.)
228 * -> kn_kq->kq_lock
229 *
230 * Locking rules:
231 *
232 * f_attach: fdp->fd_lock, KERNEL_LOCK
233 * f_detach: fdp->fd_lock, KERNEL_LOCK
234 * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
235 * f_event via knote: whatever caller guarantees
236 * Typically, f_event(NOTE_SUBMIT) via knote: object lock
237 * f_event(!NOTE_SUBMIT) via knote: nothing,
238 * acquires/releases object lock inside.
239 */
240 static krwlock_t kqueue_filter_lock; /* lock on filter lists */
241 static kmutex_t kqueue_misc_lock; /* miscellaneous */
242
243 static kauth_listener_t kqueue_listener;
244
245 static int
246 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
247 void *arg0, void *arg1, void *arg2, void *arg3)
248 {
249 struct proc *p;
250 int result;
251
252 result = KAUTH_RESULT_DEFER;
253 p = arg0;
254
255 if (action != KAUTH_PROCESS_KEVENT_FILTER)
256 return result;
257
258 if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
259 ISSET(p->p_flag, PK_SUGID)))
260 return result;
261
262 result = KAUTH_RESULT_ALLOW;
263
264 return result;
265 }
266
267 /*
268 * Initialize the kqueue subsystem.
269 */
270 void
271 kqueue_init(void)
272 {
273
274 rw_init(&kqueue_filter_lock);
275 mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
276
277 kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
278 kqueue_listener_cb, NULL);
279 }
280
281 /*
282 * Find kfilter entry by name, or NULL if not found.
283 */
284 static struct kfilter *
285 kfilter_byname_sys(const char *name)
286 {
287 int i;
288
289 KASSERT(rw_lock_held(&kqueue_filter_lock));
290
291 for (i = 0; sys_kfilters[i].name != NULL; i++) {
292 if (strcmp(name, sys_kfilters[i].name) == 0)
293 return &sys_kfilters[i];
294 }
295 return NULL;
296 }
297
298 static struct kfilter *
299 kfilter_byname_user(const char *name)
300 {
301 int i;
302
303 KASSERT(rw_lock_held(&kqueue_filter_lock));
304
305 /* user filter slots have a NULL name if previously deregistered */
306 for (i = 0; i < user_kfilterc ; i++) {
307 if (user_kfilters[i].name != NULL &&
308 strcmp(name, user_kfilters[i].name) == 0)
309 return &user_kfilters[i];
310 }
311 return NULL;
312 }
313
314 static struct kfilter *
315 kfilter_byname(const char *name)
316 {
317 struct kfilter *kfilter;
318
319 KASSERT(rw_lock_held(&kqueue_filter_lock));
320
321 if ((kfilter = kfilter_byname_sys(name)) != NULL)
322 return kfilter;
323
324 return kfilter_byname_user(name);
325 }
326
327 /*
328 * Find kfilter entry by filter id, or NULL if not found.
329 * Assumes entries are indexed in filter id order, for speed.
330 */
331 static struct kfilter *
332 kfilter_byfilter(uint32_t filter)
333 {
334 struct kfilter *kfilter;
335
336 KASSERT(rw_lock_held(&kqueue_filter_lock));
337
338 if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
339 kfilter = &sys_kfilters[filter];
340 else if (user_kfilters != NULL &&
341 filter < EVFILT_SYSCOUNT + user_kfilterc)
342 /* it's a user filter */
343 kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
344 else
345 return (NULL); /* out of range */
346 KASSERT(kfilter->filter == filter); /* sanity check! */
347 return (kfilter);
348 }
349
350 /*
351 * Register a new kfilter. Stores the entry in user_kfilters.
352 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
353 * If retfilter != NULL, the new filterid is returned in it.
354 */
355 int
356 kfilter_register(const char *name, const struct filterops *filtops,
357 int *retfilter)
358 {
359 struct kfilter *kfilter;
360 size_t len;
361 int i;
362
363 if (name == NULL || name[0] == '\0' || filtops == NULL)
364 return (EINVAL); /* invalid args */
365
366 rw_enter(&kqueue_filter_lock, RW_WRITER);
367 if (kfilter_byname(name) != NULL) {
368 rw_exit(&kqueue_filter_lock);
369 return (EEXIST); /* already exists */
370 }
371 if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
372 rw_exit(&kqueue_filter_lock);
373 return (EINVAL); /* too many */
374 }
375
376 for (i = 0; i < user_kfilterc; i++) {
377 kfilter = &user_kfilters[i];
378 if (kfilter->name == NULL) {
379 /* Previously deregistered slot. Reuse. */
380 goto reuse;
381 }
382 }
383
384 /* check if need to grow user_kfilters */
385 if (user_kfilterc + 1 > user_kfiltermaxc) {
386 /* Grow in KFILTER_EXTENT chunks. */
387 user_kfiltermaxc += KFILTER_EXTENT;
388 len = user_kfiltermaxc * sizeof(*kfilter);
389 kfilter = kmem_alloc(len, KM_SLEEP);
390 memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
391 if (user_kfilters != NULL) {
392 memcpy(kfilter, user_kfilters, user_kfiltersz);
393 kmem_free(user_kfilters, user_kfiltersz);
394 }
395 user_kfiltersz = len;
396 user_kfilters = kfilter;
397 }
398 /* Adding new slot */
399 kfilter = &user_kfilters[user_kfilterc++];
400 reuse:
401 kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
402
403 kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
404
405 kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
406 memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
407
408 if (retfilter != NULL)
409 *retfilter = kfilter->filter;
410 rw_exit(&kqueue_filter_lock);
411
412 return (0);
413 }
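
/*
 * Illustrative sketch (not part of the original source): a driver or
 * module could register its own filter roughly as follows; the name
 * "EVFILT_EXAMPLE", the example_filt_* callbacks and "filterid" are
 * hypothetical.
 *
 *	static const struct filterops example_filtops = {
 *		.f_isfd = 0,
 *		.f_attach = example_filt_attach,
 *		.f_detach = example_filt_detach,
 *		.f_event = example_filt_event,
 *	};
 *	int filterid, error;
 *
 *	error = kfilter_register("EVFILT_EXAMPLE", &example_filtops,
 *	    &filterid);
 *	...
 *	error = kfilter_unregister("EVFILT_EXAMPLE");
 */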
414
415 /*
416 * Unregister a kfilter previously registered with kfilter_register.
417 * This retains the filter slot, but clears the name and frees filtops (filter
418 * operations); the slot may later be reused by kfilter_register().
419 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
420 */
421 int
422 kfilter_unregister(const char *name)
423 {
424 struct kfilter *kfilter;
425
426 if (name == NULL || name[0] == '\0')
427 return (EINVAL); /* invalid name */
428
429 rw_enter(&kqueue_filter_lock, RW_WRITER);
430 if (kfilter_byname_sys(name) != NULL) {
431 rw_exit(&kqueue_filter_lock);
432 return (EINVAL); /* can't detach system filters */
433 }
434
435 kfilter = kfilter_byname_user(name);
436 if (kfilter == NULL) {
437 rw_exit(&kqueue_filter_lock);
438 return (ENOENT);
439 }
440 if (kfilter->refcnt != 0) {
441 rw_exit(&kqueue_filter_lock);
442 return (EBUSY);
443 }
444
445 /* Cast away const (but we know it's safe). */
446 kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
447 kfilter->name = NULL; /* mark as `not implemented' */
448
449 if (kfilter->filtops != NULL) {
450 /* Cast away const (but we know it's safe). */
451 kmem_free(__UNCONST(kfilter->filtops),
452 sizeof(*kfilter->filtops));
453 kfilter->filtops = NULL; /* mark as `not implemented' */
454 }
455 rw_exit(&kqueue_filter_lock);
456
457 return (0);
458 }
459
460
461 /*
462 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
463 * descriptors. Calls fileops kqfilter method for given file descriptor.
464 */
465 static int
466 filt_fileattach(struct knote *kn)
467 {
468 file_t *fp;
469
470 fp = kn->kn_obj;
471
472 return (*fp->f_ops->fo_kqfilter)(fp, kn);
473 }
474
475 /*
476 * Filter detach method for EVFILT_READ on kqueue descriptor.
477 */
478 static void
479 filt_kqdetach(struct knote *kn)
480 {
481 struct kqueue *kq;
482
483 kq = ((file_t *)kn->kn_obj)->f_kqueue;
484
485 mutex_spin_enter(&kq->kq_lock);
486 selremove_knote(&kq->kq_sel, kn);
487 mutex_spin_exit(&kq->kq_lock);
488 }
489
490 /*
491 * Filter event method for EVFILT_READ on kqueue descriptor.
492 */
493 /*ARGSUSED*/
494 static int
495 filt_kqueue(struct knote *kn, long hint)
496 {
497 struct kqueue *kq;
498 int rv;
499
500 kq = ((file_t *)kn->kn_obj)->f_kqueue;
501
502 if (hint != NOTE_SUBMIT)
503 mutex_spin_enter(&kq->kq_lock);
504 kn->kn_data = kq->kq_count;
505 rv = (kn->kn_data > 0);
506 if (hint != NOTE_SUBMIT)
507 mutex_spin_exit(&kq->kq_lock);
508
509 return rv;
510 }
511
512 /*
513 * Filter attach method for EVFILT_PROC.
514 */
515 static int
516 filt_procattach(struct knote *kn)
517 {
518 struct proc *p;
519 struct lwp *curl;
520
521 curl = curlwp;
522
523 mutex_enter(&proc_lock);
524 if (kn->kn_flags & EV_FLAG1) {
525 /*
526 * NOTE_TRACK attaches to the child process too early
527 * for proc_find, so do a raw look up and check the state
528 * explicitly.
529 */
530 p = proc_find_raw(kn->kn_id);
531 if (p != NULL && p->p_stat != SIDL)
532 p = NULL;
533 } else {
534 p = proc_find(kn->kn_id);
535 }
536
537 if (p == NULL) {
538 mutex_exit(&proc_lock);
539 return ESRCH;
540 }
541
542 /*
543 * Fail if it's not owned by you, or the last exec gave us
544 * setuid/setgid privs (unless you're root).
545 */
546 mutex_enter(p->p_lock);
547 mutex_exit(&proc_lock);
548 if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
549 p, NULL, NULL, NULL) != 0) {
550 mutex_exit(p->p_lock);
551 return EACCES;
552 }
553
554 kn->kn_obj = p;
555 kn->kn_flags |= EV_CLEAR; /* automatically set */
556
557 /*
558 * internal flag indicating registration done by kernel
559 */
560 if (kn->kn_flags & EV_FLAG1) {
561 kn->kn_data = kn->kn_sdata; /* ppid */
562 kn->kn_fflags = NOTE_CHILD;
563 kn->kn_flags &= ~EV_FLAG1;
564 }
565 SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
566 mutex_exit(p->p_lock);
567
568 return 0;
569 }
570
571 /*
572 * Filter detach method for EVFILT_PROC.
573 *
574 * The knote may be attached to a different process, which may exit,
575 * leaving nothing for the knote to be attached to. So when the process
576 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
577 * it will be deleted when read out. However, as part of the knote deletion,
578 * this routine is called, so a check is needed to avoid actually performing
579 * a detach, because the original process might not exist any more.
580 */
581 static void
582 filt_procdetach(struct knote *kn)
583 {
584 struct proc *p;
585
586 if (kn->kn_status & KN_DETACHED)
587 return;
588
589 p = kn->kn_obj;
590
591 mutex_enter(p->p_lock);
592 SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
593 mutex_exit(p->p_lock);
594 }
595
596 /*
597 * Filter event method for EVFILT_PROC.
598 */
599 static int
600 filt_proc(struct knote *kn, long hint)
601 {
602 u_int event, fflag;
603 struct kevent kev;
604 struct kqueue *kq;
605 int error;
606
607 event = (u_int)hint & NOTE_PCTRLMASK;
608 kq = kn->kn_kq;
609 fflag = 0;
610
611 /* If the user is interested in this event, record it. */
612 if (kn->kn_sfflags & event)
613 fflag |= event;
614
615 if (event == NOTE_EXIT) {
616 struct proc *p = kn->kn_obj;
617
618 if (p != NULL)
619 kn->kn_data = P_WAITSTATUS(p);
620 /*
621 * Process is gone, so flag the event as finished.
622 *
623 * Detach the knote from watched process and mark
624 * it as such. We can't leave this to kqueue_scan(),
625 * since the process might not exist by then. And we
626 * have to do this now, since psignal KNOTE() is called
627 * also for zombies and we might end up reading freed
628 * memory if the kevent would already be picked up
629 * and knote g/c'ed.
630 */
631 filt_procdetach(kn);
632
633 mutex_spin_enter(&kq->kq_lock);
634 kn->kn_status |= KN_DETACHED;
635 /* Mark as ONESHOT, so that the knote is g/c'ed when read */
636 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
637 kn->kn_fflags |= fflag;
638 mutex_spin_exit(&kq->kq_lock);
639
640 return 1;
641 }
642
643 mutex_spin_enter(&kq->kq_lock);
644 if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
645 /*
646 * Process forked, and user wants to track the new process,
647 * so attach a new knote to it, and immediately report an
648 * event with the parent's pid. Register knote with new
649 * process.
650 */
651 memset(&kev, 0, sizeof(kev));
652 kev.ident = hint & NOTE_PDATAMASK; /* pid */
653 kev.filter = kn->kn_filter;
654 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
655 kev.fflags = kn->kn_sfflags;
656 kev.data = kn->kn_id; /* parent */
657 kev.udata = kn->kn_kevent.udata; /* preserve udata */
658 mutex_spin_exit(&kq->kq_lock);
659 error = kqueue_register(kq, &kev);
660 mutex_spin_enter(&kq->kq_lock);
661 if (error != 0)
662 kn->kn_fflags |= NOTE_TRACKERR;
663 }
664 kn->kn_fflags |= fflag;
665 fflag = kn->kn_fflags;
666 mutex_spin_exit(&kq->kq_lock);
667
668 return fflag != 0;
669 }
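
/*
 * Usage sketch (illustrative): from userland, a process watch with fork
 * tracking is typically registered as
 *
 *	struct kevent kev;
 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * On fork with NOTE_TRACK set, filt_proc() above registers a new knote
 * on the child (via EV_FLAG1), which then reports NOTE_CHILD.
 */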
670
671 static void
672 filt_timerexpire(void *knx)
673 {
674 struct knote *kn = knx;
675 int tticks;
676
677 mutex_enter(&kqueue_misc_lock);
678 kn->kn_data++;
679 knote_activate(kn);
680 if ((kn->kn_flags & EV_ONESHOT) == 0) {
681 tticks = mstohz(kn->kn_sdata);
682 if (tticks <= 0)
683 tticks = 1;
684 callout_schedule((callout_t *)kn->kn_hook, tticks);
685 }
686 mutex_exit(&kqueue_misc_lock);
687 }
688
689 /*
690 * data contains amount of time to sleep, in milliseconds
691 */
692 static int
693 filt_timerattach(struct knote *kn)
694 {
695 callout_t *calloutp;
696 struct kqueue *kq;
697 int tticks;
698
699 tticks = mstohz(kn->kn_sdata);
700
701 /* if the supplied value is under our resolution, use 1 tick */
702 if (tticks == 0) {
703 if (kn->kn_sdata == 0)
704 return EINVAL;
705 tticks = 1;
706 }
707
708 if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
709 (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
710 atomic_dec_uint(&kq_ncallouts);
711 return ENOMEM;
712 }
713 callout_init(calloutp, CALLOUT_MPSAFE);
714
715 kq = kn->kn_kq;
716 mutex_spin_enter(&kq->kq_lock);
717 kn->kn_flags |= EV_CLEAR; /* automatically set */
718 kn->kn_hook = calloutp;
719 mutex_spin_exit(&kq->kq_lock);
720
721 callout_reset(calloutp, tticks, filt_timerexpire, kn);
722
723 return (0);
724 }
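
/*
 * Usage sketch (illustrative): a periodic 5 second timer would be
 * registered from userland as
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 5000, NULL);
 *
 * i.e. data is the period in milliseconds; the callout re-arms itself
 * in filt_timerexpire() unless EV_ONESHOT was requested.
 */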
725
726 static void
727 filt_timerdetach(struct knote *kn)
728 {
729 callout_t *calloutp;
730 struct kqueue *kq = kn->kn_kq;
731
732 mutex_spin_enter(&kq->kq_lock);
733 /* prevent rescheduling when we expire */
734 kn->kn_flags |= EV_ONESHOT;
735 mutex_spin_exit(&kq->kq_lock);
736
737 calloutp = (callout_t *)kn->kn_hook;
738 callout_halt(calloutp, NULL);
739 callout_destroy(calloutp);
740 kmem_free(calloutp, sizeof(*calloutp));
741 atomic_dec_uint(&kq_ncallouts);
742 }
743
744 static int
745 filt_timer(struct knote *kn, long hint)
746 {
747 int rv;
748
749 mutex_enter(&kqueue_misc_lock);
750 rv = (kn->kn_data != 0);
751 mutex_exit(&kqueue_misc_lock);
752
753 return rv;
754 }
755
756 /*
757 * Filter event method for EVFILT_FS.
758 */
759 struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
760
761 static int
762 filt_fsattach(struct knote *kn)
763 {
764
765 mutex_enter(&kqueue_misc_lock);
766 kn->kn_flags |= EV_CLEAR;
767 SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
768 mutex_exit(&kqueue_misc_lock);
769
770 return 0;
771 }
772
773 static void
774 filt_fsdetach(struct knote *kn)
775 {
776
777 mutex_enter(&kqueue_misc_lock);
778 SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
779 mutex_exit(&kqueue_misc_lock);
780 }
781
782 static int
783 filt_fs(struct knote *kn, long hint)
784 {
785 int rv;
786
787 mutex_enter(&kqueue_misc_lock);
788 kn->kn_fflags |= hint;
789 rv = (kn->kn_fflags != 0);
790 mutex_exit(&kqueue_misc_lock);
791
792 return rv;
793 }
794
795 static int
796 filt_userattach(struct knote *kn)
797 {
798 struct kqueue *kq = kn->kn_kq;
799
800 /*
801 * EVFILT_USER knotes are not attached to anything in the kernel.
802 */
803 mutex_spin_enter(&kq->kq_lock);
804 kn->kn_hook = NULL;
805 if (kn->kn_fflags & NOTE_TRIGGER)
806 kn->kn_hookid = 1;
807 else
808 kn->kn_hookid = 0;
809 mutex_spin_exit(&kq->kq_lock);
810 return (0);
811 }
812
813 static void
814 filt_userdetach(struct knote *kn)
815 {
816
817 /*
818 * EVFILT_USER knotes are not attached to anything in the kernel.
819 */
820 }
821
822 static int
823 filt_user(struct knote *kn, long hint)
824 {
825 struct kqueue *kq = kn->kn_kq;
826 int hookid;
827
828 mutex_spin_enter(&kq->kq_lock);
829 hookid = kn->kn_hookid;
830 mutex_spin_exit(&kq->kq_lock);
831
832 return hookid;
833 }
834
835 static void
836 filt_usertouch(struct knote *kn, struct kevent *kev, long type)
837 {
838 int ffctrl;
839
840 KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
841
842 switch (type) {
843 case EVENT_REGISTER:
844 if (kev->fflags & NOTE_TRIGGER)
845 kn->kn_hookid = 1;
846
847 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
848 kev->fflags &= NOTE_FFLAGSMASK;
849 switch (ffctrl) {
850 case NOTE_FFNOP:
851 break;
852
853 case NOTE_FFAND:
854 kn->kn_sfflags &= kev->fflags;
855 break;
856
857 case NOTE_FFOR:
858 kn->kn_sfflags |= kev->fflags;
859 break;
860
861 case NOTE_FFCOPY:
862 kn->kn_sfflags = kev->fflags;
863 break;
864
865 default:
866 /* XXX Return error? */
867 break;
868 }
869 kn->kn_sdata = kev->data;
870 if (kev->flags & EV_CLEAR) {
871 kn->kn_hookid = 0;
872 kn->kn_data = 0;
873 kn->kn_fflags = 0;
874 }
875 break;
876
877 case EVENT_PROCESS:
878 *kev = kn->kn_kevent;
879 kev->fflags = kn->kn_sfflags;
880 kev->data = kn->kn_sdata;
881 if (kn->kn_flags & EV_CLEAR) {
882 kn->kn_hookid = 0;
883 kn->kn_data = 0;
884 kn->kn_fflags = 0;
885 }
886 break;
887
888 default:
889 panic("filt_usertouch() - invalid type (%ld)", type);
890 break;
891 }
892 }
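
/*
 * Usage sketch (illustrative): EVFILT_USER events are armed and then
 * triggered entirely from userland, e.g.
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	...
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The NOTE_FFAND/NOTE_FFOR/NOTE_FFCOPY operations handled above let the
 * trigger combine its fflags with those saved at registration time.
 */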
893
894 /*
895 * filt_seltrue:
896 *
897 * This filter "event" routine simulates seltrue().
898 */
899 int
900 filt_seltrue(struct knote *kn, long hint)
901 {
902
903 /*
904 * We don't know how much data can be read/written,
905 * but we know that it *can* be.  This is about as
906 * well as select/poll can do.
907 */
908 kn->kn_data = 0;
909 return (1);
910 }
911
912 /*
913 * This provides a full kqfilter entry for device switch tables, which
914 * has the same effect as a filter using filt_seltrue() as its filter method.
915 */
916 static void
917 filt_seltruedetach(struct knote *kn)
918 {
919 /* Nothing to do */
920 }
921
922 const struct filterops seltrue_filtops = {
923 .f_isfd = 1,
924 .f_attach = NULL,
925 .f_detach = filt_seltruedetach,
926 .f_event = filt_seltrue,
927 };
928
929 int
930 seltrue_kqfilter(dev_t dev, struct knote *kn)
931 {
932 switch (kn->kn_filter) {
933 case EVFILT_READ:
934 case EVFILT_WRITE:
935 kn->kn_fop = &seltrue_filtops;
936 break;
937 default:
938 return (EINVAL);
939 }
940
941 /* Nothing more to do */
942 return (0);
943 }
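
/*
 * Illustrative sketch (members other than d_kqfilter are hypothetical):
 * a character device that is always ready for I/O can simply point its
 * cdevsw entry here:
 *
 *	const struct cdevsw example_cdevsw = {
 *		...
 *		.d_poll = example_poll,
 *		.d_kqfilter = seltrue_kqfilter,
 *		...
 *	};
 */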
944
945 /*
946 * kqueue(2) system call.
947 */
948 static int
949 kqueue1(struct lwp *l, int flags, register_t *retval)
950 {
951 struct kqueue *kq;
952 file_t *fp;
953 int fd, error;
954
955 if ((error = fd_allocfile(&fp, &fd)) != 0)
956 return error;
957 fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
958 fp->f_type = DTYPE_KQUEUE;
959 fp->f_ops = &kqueueops;
960 kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
961 mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
962 cv_init(&kq->kq_cv, "kqueue");
963 selinit(&kq->kq_sel);
964 TAILQ_INIT(&kq->kq_head);
965 fp->f_kqueue = kq;
966 *retval = fd;
967 kq->kq_fdp = curlwp->l_fd;
968 fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
969 fd_affix(curproc, fp, fd);
970 return error;
971 }
972
973 /*
974 * kqueue(2) system call.
975 */
976 int
977 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
978 {
979 return kqueue1(l, 0, retval);
980 }
981
982 int
983 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
984 register_t *retval)
985 {
986 /* {
987 syscallarg(int) flags;
988 } */
989 return kqueue1(l, SCARG(uap, flags), retval);
990 }
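
/*
 * Userland usage sketch (illustrative, not part of this file): the
 * typical calling sequence for these syscalls is
 *
 *	int kq = kqueue();
 *	struct kevent ch, ev;
 *	EV_SET(&ch, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &ch, 1, &ev, 1, NULL);
 *
 * Changes are applied first (kqueue_register()), then events are
 * collected by kqueue_scan(), subject to the timeout.
 */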
991
992 /*
993 * kevent(2) system call.
994 */
995 int
996 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
997 struct kevent *changes, size_t index, int n)
998 {
999
1000 return copyin(changelist + index, changes, n * sizeof(*changes));
1001 }
1002
1003 int
1004 kevent_put_events(void *ctx, struct kevent *events,
1005 struct kevent *eventlist, size_t index, int n)
1006 {
1007
1008 return copyout(events, eventlist + index, n * sizeof(*events));
1009 }
1010
1011 static const struct kevent_ops kevent_native_ops = {
1012 .keo_private = NULL,
1013 .keo_fetch_timeout = copyin,
1014 .keo_fetch_changes = kevent_fetch_changes,
1015 .keo_put_events = kevent_put_events,
1016 };
1017
1018 int
1019 sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
1020 register_t *retval)
1021 {
1022 /* {
1023 syscallarg(int) fd;
1024 syscallarg(const struct kevent *) changelist;
1025 syscallarg(size_t) nchanges;
1026 syscallarg(struct kevent *) eventlist;
1027 syscallarg(size_t) nevents;
1028 syscallarg(const struct timespec *) timeout;
1029 } */
1030
1031 return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
1032 SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
1033 SCARG(uap, timeout), &kevent_native_ops);
1034 }
1035
1036 int
1037 kevent1(register_t *retval, int fd,
1038 const struct kevent *changelist, size_t nchanges,
1039 struct kevent *eventlist, size_t nevents,
1040 const struct timespec *timeout,
1041 const struct kevent_ops *keops)
1042 {
1043 struct kevent *kevp;
1044 struct kqueue *kq;
1045 struct timespec ts;
1046 size_t i, n, ichange;
1047 int nerrors, error;
1048 struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
1049 file_t *fp;
1050
1051 /* check that we're dealing with a kq */
1052 fp = fd_getfile(fd);
1053 if (fp == NULL)
1054 return (EBADF);
1055
1056 if (fp->f_type != DTYPE_KQUEUE) {
1057 fd_putfile(fd);
1058 return (EBADF);
1059 }
1060
1061 if (timeout != NULL) {
1062 error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
1063 if (error)
1064 goto done;
1065 timeout = &ts;
1066 }
1067
1068 kq = fp->f_kqueue;
1069 nerrors = 0;
1070 ichange = 0;
1071
1072 /* traverse list of events to register */
1073 while (nchanges > 0) {
1074 n = MIN(nchanges, __arraycount(kevbuf));
1075 error = (*keops->keo_fetch_changes)(keops->keo_private,
1076 changelist, kevbuf, ichange, n);
1077 if (error)
1078 goto done;
1079 for (i = 0; i < n; i++) {
1080 kevp = &kevbuf[i];
1081 kevp->flags &= ~EV_SYSFLAGS;
1082 /* register each knote */
1083 error = kqueue_register(kq, kevp);
1084 if (!error && !(kevp->flags & EV_RECEIPT))
1085 continue;
1086 if (nevents == 0)
1087 goto done;
1088 kevp->flags = EV_ERROR;
1089 kevp->data = error;
1090 error = (*keops->keo_put_events)
1091 (keops->keo_private, kevp,
1092 eventlist, nerrors, 1);
1093 if (error)
1094 goto done;
1095 nevents--;
1096 nerrors++;
1097 }
1098 nchanges -= n; /* update the results */
1099 ichange += n;
1100 }
1101 if (nerrors) {
1102 *retval = nerrors;
1103 error = 0;
1104 goto done;
1105 }
1106
1107 /* actually scan through the events */
1108 error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
1109 kevbuf, __arraycount(kevbuf));
1110 done:
1111 fd_putfile(fd);
1112 return (error);
1113 }
1114
1115 /*
1116 * Register a given kevent kev onto the kqueue
1117 */
1118 static int
1119 kqueue_register(struct kqueue *kq, struct kevent *kev)
1120 {
1121 struct kfilter *kfilter;
1122 filedesc_t *fdp;
1123 file_t *fp;
1124 fdfile_t *ff;
1125 struct knote *kn, *newkn;
1126 struct klist *list;
1127 int error, fd, rv;
1128
1129 fdp = kq->kq_fdp;
1130 fp = NULL;
1131 kn = NULL;
1132 error = 0;
1133 fd = 0;
1134
1135 newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
1136
1137 rw_enter(&kqueue_filter_lock, RW_READER);
1138 kfilter = kfilter_byfilter(kev->filter);
1139 if (kfilter == NULL || kfilter->filtops == NULL) {
1140 /* filter not found nor implemented */
1141 rw_exit(&kqueue_filter_lock);
1142 kmem_free(newkn, sizeof(*newkn));
1143 return (EINVAL);
1144 }
1145
1146 /* search if knote already exists */
1147 if (kfilter->filtops->f_isfd) {
1148 /* monitoring a file descriptor */
1149 /* validate descriptor */
1150 if (kev->ident > INT_MAX
1151 || (fp = fd_getfile(fd = kev->ident)) == NULL) {
1152 rw_exit(&kqueue_filter_lock);
1153 kmem_free(newkn, sizeof(*newkn));
1154 return EBADF;
1155 }
1156 mutex_enter(&fdp->fd_lock);
1157 ff = fdp->fd_dt->dt_ff[fd];
1158 if (ff->ff_refcnt & FR_CLOSING) {
1159 error = EBADF;
1160 goto doneunlock;
1161 }
1162 if (fd <= fdp->fd_lastkqfile) {
1163 SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
1164 if (kq == kn->kn_kq &&
1165 kev->filter == kn->kn_filter)
1166 break;
1167 }
1168 }
1169 } else {
1170 /*
1171 * not monitoring a file descriptor, so
1172 * lookup knotes in internal hash table
1173 */
1174 mutex_enter(&fdp->fd_lock);
1175 if (fdp->fd_knhashmask != 0) {
1176 list = &fdp->fd_knhash[
1177 KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1178 SLIST_FOREACH(kn, list, kn_link) {
1179 if (kev->ident == kn->kn_id &&
1180 kq == kn->kn_kq &&
1181 kev->filter == kn->kn_filter)
1182 break;
1183 }
1184 }
1185 }
1186
1187 /*
1188 * kn now contains the matching knote, or NULL if no match
1189 */
1190 if (kn == NULL) {
1191 if (kev->flags & EV_ADD) {
1192 /* create new knote */
1193 kn = newkn;
1194 newkn = NULL;
1195 kn->kn_obj = fp;
1196 kn->kn_id = kev->ident;
1197 kn->kn_kq = kq;
1198 kn->kn_fop = kfilter->filtops;
1199 kn->kn_kfilter = kfilter;
1200 kn->kn_sfflags = kev->fflags;
1201 kn->kn_sdata = kev->data;
1202 kev->fflags = 0;
1203 kev->data = 0;
1204 kn->kn_kevent = *kev;
1205
1206 KASSERT(kn->kn_fop != NULL);
1207 /*
1208 * apply reference count to knote structure, and
1209 * do not release it at the end of this routine.
1210 */
1211 fp = NULL;
1212
1213 if (!kn->kn_fop->f_isfd) {
1214 /*
1215 * If knote is not on an fd, store on
1216 * internal hash table.
1217 */
1218 if (fdp->fd_knhashmask == 0) {
1219 /* XXXAD can block with fd_lock held */
1220 fdp->fd_knhash = hashinit(KN_HASHSIZE,
1221 HASH_LIST, true,
1222 &fdp->fd_knhashmask);
1223 }
1224 list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
1225 fdp->fd_knhashmask)];
1226 } else {
1227 /* Otherwise, knote is on an fd. */
1228 list = (struct klist *)
1229 &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1230 if ((int)kn->kn_id > fdp->fd_lastkqfile)
1231 fdp->fd_lastkqfile = kn->kn_id;
1232 }
1233 SLIST_INSERT_HEAD(list, kn, kn_link);
1234
1235 KERNEL_LOCK(1, NULL); /* XXXSMP */
1236 error = (*kfilter->filtops->f_attach)(kn);
1237 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1238 if (error != 0) {
1239 #ifdef DEBUG
1240 struct proc *p = curlwp->l_proc;
1241 const file_t *ft = kn->kn_obj;
1242 printf("%s: %s[%d]: event type %d not "
1243 "supported for file type %d/%s "
1244 "(error %d)\n", __func__,
1245 p->p_comm, p->p_pid,
1246 kn->kn_filter, ft ? ft->f_type : -1,
1247 ft ? ft->f_ops->fo_name : "?", error);
1248 #endif
1249
1250 /* knote_detach() drops fdp->fd_lock */
1251 knote_detach(kn, fdp, false);
1252 goto done;
1253 }
1254 atomic_inc_uint(&kfilter->refcnt);
1255 goto done_ev_add;
1256 } else {
1257 /* No matching knote and the EV_ADD flag is not set. */
1258 error = ENOENT;
1259 goto doneunlock;
1260 }
1261 }
1262
1263 if (kev->flags & EV_DELETE) {
1264 /* knote_detach() drops fdp->fd_lock */
1265 knote_detach(kn, fdp, true);
1266 goto done;
1267 }
1268
1269 /*
1270 * The user may change some filter values after the
1271 * initial EV_ADD, but doing so will not reset any
1272 * filter which has already been triggered.
1273 */
1274 kn->kn_kevent.udata = kev->udata;
1275 KASSERT(kn->kn_fop != NULL);
1276 if (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL) {
1277 mutex_spin_enter(&kq->kq_lock);
1278 (*kn->kn_fop->f_touch)(kn, kev, EVENT_REGISTER);
1279 mutex_spin_exit(&kq->kq_lock);
1280 } else {
1281 kn->kn_sfflags = kev->fflags;
1282 kn->kn_sdata = kev->data;
1283 }
1284
1285 /*
1286 * We can get here if we are trying to attach
1287 * an event to a file descriptor that does not
1288 * support events, and the attach routine is
1289 * broken and does not return an error.
1290 */
1291 done_ev_add:
1292 KASSERT(kn->kn_fop != NULL);
1293 KASSERT(kn->kn_fop->f_event != NULL);
1294 KERNEL_LOCK(1, NULL); /* XXXSMP */
1295 rv = (*kn->kn_fop->f_event)(kn, 0);
1296 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1297 if (rv)
1298 knote_activate(kn);
1299
1300 /* disable knote */
1301 if ((kev->flags & EV_DISABLE)) {
1302 mutex_spin_enter(&kq->kq_lock);
1303 if ((kn->kn_status & KN_DISABLED) == 0)
1304 kn->kn_status |= KN_DISABLED;
1305 mutex_spin_exit(&kq->kq_lock);
1306 }
1307
1308 /* enable knote */
1309 if ((kev->flags & EV_ENABLE)) {
1310 knote_enqueue(kn);
1311 }
1312 doneunlock:
1313 mutex_exit(&fdp->fd_lock);
1314 done:
1315 rw_exit(&kqueue_filter_lock);
1316 if (newkn != NULL)
1317 kmem_free(newkn, sizeof(*newkn));
1318 if (fp != NULL)
1319 fd_putfile(fd);
1320 return (error);
1321 }
1322
1323 #if defined(DEBUG)
1324 #define KN_FMT(buf, kn) \
1325 (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
1326
1327 static void
1328 kqueue_check(const char *func, size_t line, const struct kqueue *kq)
1329 {
1330 const struct knote *kn;
1331 int count;
1332 int nmarker;
1333 char buf[128];
1334
1335 KASSERT(mutex_owned(&kq->kq_lock));
1336 KASSERT(kq->kq_count >= 0);
1337
1338 count = 0;
1339 nmarker = 0;
1340 TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
1341 if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
1342 panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
1343 func, line, kq, kn, KN_FMT(buf, kn));
1344 }
1345 if ((kn->kn_status & KN_MARKER) == 0) {
1346 if (kn->kn_kq != kq) {
1347 panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
1348 func, line, kq, kn, kn->kn_kq,
1349 KN_FMT(buf, kn));
1350 }
1351 if ((kn->kn_status & KN_ACTIVE) == 0) {
1352 panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
1353 func, line, kq, kn, KN_FMT(buf, kn));
1354 }
1355 count++;
1356 if (count > kq->kq_count) {
1357 panic("%s,%zu: kq=%p kq->kq_count(%d) != "
1358 "count(%d), nmarker=%d",
1359 func, line, kq, kq->kq_count, count,
1360 nmarker);
1361 }
1362 } else {
1363 nmarker++;
1364 #if 0
1365 if (nmarker > 10000) {
1366 panic("%s,%zu: kq=%p too many markers: "
1367 "%d != %d, nmarker=%d",
1368 func, line, kq, kq->kq_count, count,
1369 nmarker);
1370 }
1371 #endif
1372 }
1373 }
1374 }
1375 #define kq_check(a) kqueue_check(__func__, __LINE__, (a))
1376 #else /* defined(DEBUG) */
1377 #define kq_check(a) /* nothing */
1378 #endif /* defined(DEBUG) */
1379
1380 /*
1381 * Scan through the list of events on fp (for a maximum of maxevents),
1382 * returning the results into ulistp. The timeout is determined by tsp; if
1383 * NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
1384 * as appropriate.
1385 */
1386 static int
1387 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
1388 const struct timespec *tsp, register_t *retval,
1389 const struct kevent_ops *keops, struct kevent *kevbuf,
1390 size_t kevcnt)
1391 {
1392 struct kqueue *kq;
1393 struct kevent *kevp;
1394 struct timespec ats, sleepts;
1395 struct knote *kn, *marker, morker;
1396 size_t count, nkev, nevents;
1397 int timeout, error, touch, rv, influx;
1398 filedesc_t *fdp;
1399
1400 fdp = curlwp->l_fd;
1401 kq = fp->f_kqueue;
1402 count = maxevents;
1403 nkev = nevents = error = 0;
1404 if (count == 0) {
1405 *retval = 0;
1406 return 0;
1407 }
1408
1409 if (tsp) { /* timeout supplied */
1410 ats = *tsp;
1411 if (inittimeleft(&ats, &sleepts) == -1) {
1412 *retval = maxevents;
1413 return EINVAL;
1414 }
1415 timeout = tstohz(&ats);
1416 if (timeout <= 0)
1417 timeout = -1; /* do poll */
1418 } else {
1419 /* no timeout, wait forever */
1420 timeout = 0;
1421 }
1422
1423 memset(&morker, 0, sizeof(morker));
1424 marker = &morker;
1425 marker->kn_status = KN_MARKER;
1426 mutex_spin_enter(&kq->kq_lock);
1427 retry:
1428 kevp = kevbuf;
1429 if (kq->kq_count == 0) {
1430 if (timeout >= 0) {
1431 error = cv_timedwait_sig(&kq->kq_cv,
1432 &kq->kq_lock, timeout);
1433 if (error == 0) {
1434 if (tsp == NULL || (timeout =
1435 gettimeleft(&ats, &sleepts)) > 0)
1436 goto retry;
1437 } else {
1438 /* don't restart after signals... */
1439 if (error == ERESTART)
1440 error = EINTR;
1441 if (error == EWOULDBLOCK)
1442 error = 0;
1443 }
1444 }
1445 mutex_spin_exit(&kq->kq_lock);
1446 goto done;
1447 }
1448
1449 /* mark end of knote list */
1450 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1451 influx = 0;
1452
1453 /*
1454 * Acquire the fdp->fd_lock interlock to avoid races with
1455 * file creation/destruction from other threads.
1456 */
1457 relock:
1458 mutex_spin_exit(&kq->kq_lock);
1459 mutex_enter(&fdp->fd_lock);
1460 mutex_spin_enter(&kq->kq_lock);
1461
1462 while (count != 0) {
1463 kn = TAILQ_FIRST(&kq->kq_head); /* get next knote */
1464
1465 if ((kn->kn_status & KN_MARKER) != 0 && kn != marker) {
1466 if (influx) {
1467 influx = 0;
1468 KQ_FLUX_WAKEUP(kq);
1469 }
1470 mutex_exit(&fdp->fd_lock);
1471 (void)cv_wait(&kq->kq_cv, &kq->kq_lock);
1472 goto relock;
1473 }
1474
1475 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1476 if (kn == marker) {
1477 /* it's our marker, stop */
1478 KQ_FLUX_WAKEUP(kq);
1479 if (count == maxevents) {
1480 mutex_exit(&fdp->fd_lock);
1481 goto retry;
1482 }
1483 break;
1484 }
1485 KASSERT((kn->kn_status & KN_BUSY) == 0);
1486
1487 kq_check(kq);
1488 kn->kn_status &= ~KN_QUEUED;
1489 kn->kn_status |= KN_BUSY;
1490 kq_check(kq);
1491 if (kn->kn_status & KN_DISABLED) {
1492 kn->kn_status &= ~KN_BUSY;
1493 kq->kq_count--;
1494 /* don't want disabled events */
1495 continue;
1496 }
1497 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1498 mutex_spin_exit(&kq->kq_lock);
1499 KASSERT(kn->kn_fop != NULL);
1500 KASSERT(kn->kn_fop->f_event != NULL);
1501 KERNEL_LOCK(1, NULL); /* XXXSMP */
1502 KASSERT(mutex_owned(&fdp->fd_lock));
1503 rv = (*kn->kn_fop->f_event)(kn, 0);
1504 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1505 mutex_spin_enter(&kq->kq_lock);
1506 /* Re-poll if note was re-enqueued. */
1507 if ((kn->kn_status & KN_QUEUED) != 0) {
1508 kn->kn_status &= ~KN_BUSY;
1509 /* Re-enqueue raised kq_count, lower it again */
1510 kq->kq_count--;
1511 influx = 1;
1512 continue;
1513 }
1514 if (rv == 0) {
1515 /*
1516 * non-ONESHOT event that hasn't
1517 * triggered again, so de-queue.
1518 */
1519 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1520 kq->kq_count--;
1521 influx = 1;
1522 continue;
1523 }
1524 }
1525 KASSERT(kn->kn_fop != NULL);
1526 touch = (!kn->kn_fop->f_isfd &&
1527 kn->kn_fop->f_touch != NULL);
1528 /* XXXAD should be got from f_event if !oneshot. */
1529 if (touch) {
1530 (*kn->kn_fop->f_touch)(kn, kevp, EVENT_PROCESS);
1531 } else {
1532 *kevp = kn->kn_kevent;
1533 }
1534 kevp++;
1535 nkev++;
1536 influx = 1;
1537 if (kn->kn_flags & EV_ONESHOT) {
1538 /* delete ONESHOT events after retrieval */
1539 kn->kn_status &= ~KN_BUSY;
1540 kq->kq_count--;
1541 mutex_spin_exit(&kq->kq_lock);
1542 knote_detach(kn, fdp, true);
1543 mutex_enter(&fdp->fd_lock);
1544 mutex_spin_enter(&kq->kq_lock);
1545 } else if (kn->kn_flags & EV_CLEAR) {
1546 /*
1547 * Clear state after retrieval: manually clear
1548 * knotes that weren't 'touch'ed; 'touch'ed knotes
1549 * clear their own state in the EVENT_PROCESS
1550 * touch call above.
1551 */
1553 if (touch == 0) {
1554 kn->kn_data = 0;
1555 kn->kn_fflags = 0;
1556 }
1557 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1558 kq->kq_count--;
1559 } else if (kn->kn_flags & EV_DISPATCH) {
1560 kn->kn_status |= KN_DISABLED;
1561 kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
1562 kq->kq_count--;
1563 } else {
1564 /* add event back on list */
1565 kq_check(kq);
1566 kn->kn_status |= KN_QUEUED;
1567 kn->kn_status &= ~KN_BUSY;
1568 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1569 kq_check(kq);
1570 }
1571
1572 if (nkev == kevcnt) {
1573 /* do copyouts in kevcnt chunks */
1574 influx = 0;
1575 KQ_FLUX_WAKEUP(kq);
1576 mutex_spin_exit(&kq->kq_lock);
1577 mutex_exit(&fdp->fd_lock);
1578 error = (*keops->keo_put_events)
1579 (keops->keo_private,
1580 kevbuf, ulistp, nevents, nkev);
1581 mutex_enter(&fdp->fd_lock);
1582 mutex_spin_enter(&kq->kq_lock);
1583 nevents += nkev;
1584 nkev = 0;
1585 kevp = kevbuf;
1586 }
1587 count--;
1588 if (error != 0 || count == 0) {
1589 /* remove marker */
1590 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1591 break;
1592 }
1593 }
1594 KQ_FLUX_WAKEUP(kq);
1595 mutex_spin_exit(&kq->kq_lock);
1596 mutex_exit(&fdp->fd_lock);
1597
1598 done:
1599 if (nkev != 0) {
1600 /* copyout remaining events */
1601 error = (*keops->keo_put_events)(keops->keo_private,
1602 kevbuf, ulistp, nevents, nkev);
1603 }
1604 *retval = maxevents - count;
1605
1606 return error;
1607 }
1608
1609 /*
1610 * fileops ioctl method for a kqueue descriptor.
1611 *
1612 * Two ioctls are currently supported. They both use struct kfilter_mapping:
1613 * KFILTER_BYFILTER find name for filter, and return result in
1614 * name, which is of size len.
1615 * KFILTER_BYNAME find filter for name. len is ignored.
1616 */
1617 /*ARGSUSED*/
1618 static int
1619 kqueue_ioctl(file_t *fp, u_long com, void *data)
1620 {
1621 struct kfilter_mapping *km;
1622 const struct kfilter *kfilter;
1623 char *name;
1624 int error;
1625
1626 km = data;
1627 error = 0;
1628 name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1629
1630 switch (com) {
1631 case KFILTER_BYFILTER: /* convert filter -> name */
1632 rw_enter(&kqueue_filter_lock, RW_READER);
1633 kfilter = kfilter_byfilter(km->filter);
1634 if (kfilter != NULL) {
1635 strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1636 rw_exit(&kqueue_filter_lock);
1637 error = copyoutstr(name, km->name, km->len, NULL);
1638 } else {
1639 rw_exit(&kqueue_filter_lock);
1640 error = ENOENT;
1641 }
1642 break;
1643
1644 case KFILTER_BYNAME: /* convert name -> filter */
1645 error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1646 if (error) {
1647 break;
1648 }
1649 rw_enter(&kqueue_filter_lock, RW_READER);
1650 kfilter = kfilter_byname(name);
1651 if (kfilter != NULL)
1652 km->filter = kfilter->filter;
1653 else
1654 error = ENOENT;
1655 rw_exit(&kqueue_filter_lock);
1656 break;
1657
1658 default:
1659 error = ENOTTY;
1660 break;
1661
1662 }
1663 kmem_free(name, KFILTER_MAXNAME);
1664 return (error);
1665 }
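
/*
 * Usage sketch (illustrative): struct kfilter_mapping is declared in
 * <sys/event.h>; resolving a filter name from userland looks like
 *
 *	struct kfilter_mapping km;
 *
 *	km.name = "EVFILT_EXAMPLE";		(hypothetical name)
 *	km.len = 0;				(ignored for BYNAME)
 *	ioctl(kq, KFILTER_BYNAME, &km);
 *
 * and on success km.filter holds the filter id; KFILTER_BYFILTER works
 * the other way, copying the name out into km.name (km.len bytes).
 */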
1666
1667 /*
1668 * fileops fcntl method for a kqueue descriptor.
1669 */
1670 static int
1671 kqueue_fcntl(file_t *fp, u_int com, void *data)
1672 {
1673
1674 return (ENOTTY);
1675 }
1676
1677 /*
1678 * fileops poll method for a kqueue descriptor.
1679 * Determine if kqueue has events pending.
1680 */
1681 static int
1682 kqueue_poll(file_t *fp, int events)
1683 {
1684 struct kqueue *kq;
1685 int revents;
1686
1687 kq = fp->f_kqueue;
1688
1689 revents = 0;
1690 if (events & (POLLIN | POLLRDNORM)) {
1691 mutex_spin_enter(&kq->kq_lock);
1692 if (kq->kq_count != 0) {
1693 revents |= events & (POLLIN | POLLRDNORM);
1694 } else {
1695 selrecord(curlwp, &kq->kq_sel);
1696 }
1697 kq_check(kq);
1698 mutex_spin_exit(&kq->kq_lock);
1699 }
1700
1701 return revents;
1702 }
1703
1704 /*
1705 * fileops stat method for a kqueue descriptor.
1706 * Returns dummy info, with st_size being number of events pending.
1707 */
1708 static int
1709 kqueue_stat(file_t *fp, struct stat *st)
1710 {
1711 struct kqueue *kq;
1712
1713 kq = fp->f_kqueue;
1714
1715 memset(st, 0, sizeof(*st));
1716 st->st_size = kq->kq_count;
1717 st->st_blksize = sizeof(struct kevent);
1718 st->st_mode = S_IFIFO;
1719
1720 return 0;
1721 }
1722
1723 static void
1724 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
1725 {
1726 struct knote *kn;
1727 filedesc_t *fdp;
1728
1729 fdp = kq->kq_fdp;
1730
1731 KASSERT(mutex_owned(&fdp->fd_lock));
1732
1733 for (kn = SLIST_FIRST(list); kn != NULL;) {
1734 if (kq != kn->kn_kq) {
1735 kn = SLIST_NEXT(kn, kn_link);
1736 continue;
1737 }
1738 knote_detach(kn, fdp, true);
1739 mutex_enter(&fdp->fd_lock);
1740 kn = SLIST_FIRST(list);
1741 }
1742 }
1743
1744
1745 /*
1746 * fileops close method for a kqueue descriptor.
1747 */
1748 static int
1749 kqueue_close(file_t *fp)
1750 {
1751 struct kqueue *kq;
1752 filedesc_t *fdp;
1753 fdfile_t *ff;
1754 int i;
1755
1756 kq = fp->f_kqueue;
1757 fp->f_kqueue = NULL;
1758 fp->f_type = 0;
1759 fdp = curlwp->l_fd;
1760
1761 mutex_enter(&fdp->fd_lock);
1762 for (i = 0; i <= fdp->fd_lastkqfile; i++) {
1763 if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1764 continue;
1765 kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1766 }
1767 if (fdp->fd_knhashmask != 0) {
1768 for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
1769 kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
1770 }
1771 }
1772 mutex_exit(&fdp->fd_lock);
1773
1774 KASSERT(kq->kq_count == 0);
1775 mutex_destroy(&kq->kq_lock);
1776 cv_destroy(&kq->kq_cv);
1777 seldestroy(&kq->kq_sel);
1778 kmem_free(kq, sizeof(*kq));
1779
1780 return (0);
1781 }
1782
1783 /*
1784 * struct fileops kqfilter method for a kqueue descriptor.
1785 * Event triggered when monitored kqueue changes.
1786 */
1787 static int
1788 kqueue_kqfilter(file_t *fp, struct knote *kn)
1789 {
1790 struct kqueue *kq;
1791
1792 kq = ((file_t *)kn->kn_obj)->f_kqueue;
1793
1794 KASSERT(fp == kn->kn_obj);
1795
1796 if (kn->kn_filter != EVFILT_READ)
1797 return 1;
1798
1799 kn->kn_fop = &kqread_filtops;
1800 mutex_enter(&kq->kq_lock);
1801 selrecord_knote(&kq->kq_sel, kn);
1802 mutex_exit(&kq->kq_lock);
1803
1804 return 0;
1805 }
1806
1807
1808 /*
1809 * Walk down a list of knotes, activating them if their event has
1810 * triggered. The caller's object lock (e.g. device driver lock)
1811 * must be held.
1812 */
1813 void
1814 knote(struct klist *list, long hint)
1815 {
1816 struct knote *kn, *tmpkn;
1817
1818 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1819 KASSERT(kn->kn_fop != NULL);
1820 KASSERT(kn->kn_fop->f_event != NULL);
1821 if ((*kn->kn_fop->f_event)(kn, hint))
1822 knote_activate(kn);
1823 }
1824 }
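
/*
 * Illustrative note (names hypothetical): a driver usually reaches this
 * function through the KNOTE() macro or selnotify() on the klist hanging
 * off its struct selinfo, with its own object lock held, e.g.
 *
 *	mutex_enter(&sc->sc_lock);
 *	...produce data...
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
 *	mutex_exit(&sc->sc_lock);
 */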
1825
1826 /*
1827 * Remove all knotes referencing a specified fd
1828 */
1829 void
1830 knote_fdclose(int fd)
1831 {
1832 struct klist *list;
1833 struct knote *kn;
1834 filedesc_t *fdp;
1835
1836 fdp = curlwp->l_fd;
1837 mutex_enter(&fdp->fd_lock);
1838 list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1839 while ((kn = SLIST_FIRST(list)) != NULL) {
1840 knote_detach(kn, fdp, true);
1841 mutex_enter(&fdp->fd_lock);
1842 }
1843 mutex_exit(&fdp->fd_lock);
1844 }
1845
1846 /*
1847 * Drop knote. Called with fdp->fd_lock held, and will drop it before
1848 * returning.
1849 */
1850 static void
1851 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
1852 {
1853 struct klist *list;
1854 struct kqueue *kq;
1855
1856 kq = kn->kn_kq;
1857
1858 KASSERT((kn->kn_status & KN_MARKER) == 0);
1859 KASSERT(mutex_owned(&fdp->fd_lock));
1860
1861 KASSERT(kn->kn_fop != NULL);
1862 /* Remove from monitored object. */
1863 if (dofop) {
1864 KASSERT(kn->kn_fop->f_detach != NULL);
1865 KERNEL_LOCK(1, NULL); /* XXXSMP */
1866 (*kn->kn_fop->f_detach)(kn);
1867 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1868 }
1869
1870 /* Remove from descriptor table. */
1871 if (kn->kn_fop->f_isfd)
1872 list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1873 else
1874 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1875
1876 SLIST_REMOVE(list, kn, knote, kn_link);
1877
1878 /* Remove from kqueue. */
1879 again:
1880 mutex_spin_enter(&kq->kq_lock);
1881 if ((kn->kn_status & KN_QUEUED) != 0) {
1882 kq_check(kq);
1883 kq->kq_count--;
1884 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1885 kn->kn_status &= ~KN_QUEUED;
1886 kq_check(kq);
1887 } else if (kn->kn_status & KN_BUSY) {
1888 mutex_spin_exit(&kq->kq_lock);
1889 goto again;
1890 }
1891 mutex_spin_exit(&kq->kq_lock);
1892
1893 mutex_exit(&fdp->fd_lock);
1894 if (kn->kn_fop->f_isfd)
1895 fd_putfile(kn->kn_id);
1896 atomic_dec_uint(&kn->kn_kfilter->refcnt);
1897 kmem_free(kn, sizeof(*kn));
1898 }
1899
1900 /*
1901 * Queue new event for knote.
1902 */
1903 static void
1904 knote_enqueue(struct knote *kn)
1905 {
1906 struct kqueue *kq;
1907
1908 KASSERT((kn->kn_status & KN_MARKER) == 0);
1909
1910 kq = kn->kn_kq;
1911
1912 mutex_spin_enter(&kq->kq_lock);
1913 if ((kn->kn_status & KN_DISABLED) != 0) {
1914 kn->kn_status &= ~KN_DISABLED;
1915 }
1916 if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
1917 kq_check(kq);
1918 kn->kn_status |= KN_QUEUED;
1919 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1920 kq->kq_count++;
1921 kq_check(kq);
1922 cv_broadcast(&kq->kq_cv);
1923 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1924 }
1925 mutex_spin_exit(&kq->kq_lock);
1926 }
1927 /*
1928 * Mark the knote active and queue it on its kqueue if it is not disabled or already queued.
1929 */
1930 static void
1931 knote_activate(struct knote *kn)
1932 {
1933 struct kqueue *kq;
1934
1935 KASSERT((kn->kn_status & KN_MARKER) == 0);
1936
1937 kq = kn->kn_kq;
1938
1939 mutex_spin_enter(&kq->kq_lock);
1940 kn->kn_status |= KN_ACTIVE;
1941 if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
1942 kq_check(kq);
1943 kn->kn_status |= KN_QUEUED;
1944 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1945 kq->kq_count++;
1946 kq_check(kq);
1947 cv_broadcast(&kq->kq_cv);
1948 selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
1949 }
1950 mutex_spin_exit(&kq->kq_lock);
1951 }
1952