/*	$NetBSD: kern_event.c,v 1.7 2003/02/01 06:23:42 thorpej Exp $	*/
/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/pool.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/sa.h>
#include <sys/syscallargs.h>

static int	kqueue_scan(struct file *fp, size_t maxevents,
		    struct kevent *ulistp, const struct timespec *timeout,
		    struct proc *p, register_t *retval);
static void	kqueue_wakeup(struct kqueue *kq);

static int	kqueue_read(struct file *fp, off_t *offset, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_write(struct file *fp, off_t *offset, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		    struct proc *p);
static int	kqueue_fcntl(struct file *fp, u_int com, caddr_t data,
		    struct proc *p);
static int	kqueue_poll(struct file *fp, int events, struct proc *p);
static int	kqueue_kqfilter(struct file *fp, struct knote *kn);
static int	kqueue_stat(struct file *fp, struct stat *sp, struct proc *p);
static int	kqueue_close(struct file *fp, struct proc *p);

static struct fileops kqueueops = {
	kqueue_read, kqueue_write, kqueue_ioctl, kqueue_fcntl, kqueue_poll,
	kqueue_stat, kqueue_close, kqueue_kqfilter
};

static void	knote_attach(struct knote *kn, struct filedesc *fdp);
static void	knote_drop(struct knote *kn, struct proc *p,
		    struct filedesc *fdp);
static void	knote_enqueue(struct knote *kn);
static void	knote_dequeue(struct knote *kn);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);

static const struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
static const struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static const struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };

struct pool	kqueue_pool;
struct pool	knote_pool;

MALLOC_DEFINE(M_KEVENT, "kevent", "kevents/knotes");

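/*
 * Mark a knote active, and enqueue it on its kqueue unless it is
 * already queued or has been disabled.
 */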
#define	KNOTE_ACTIVATE(kn)						\
do {									\
	kn->kn_status |= KN_ACTIVE;					\
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue(kn);					\
} while (0)

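/*
 * Hash an identifier (a file descriptor or process id) into the knote
 * hash table: fold the next-higher byte into the low bits and mask to
 * the table size.
 */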
#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;

/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	const struct filterops *filtops;/* operations for filter */
};

/* System defined filters */
static const struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	&file_filtops },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	&file_filtops },
	{ "EVFILT_AIO",		EVFILT_AIO,	NULL },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	&file_filtops },
	{ "EVFILT_PROC",	EVFILT_PROC,	&proc_filtops },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	&sig_filtops },
	{ NULL,			0,		NULL },	/* end of list */
};

/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */

/*
 * kqueue_init:
 *
 *	Initialize the kqueue/knote facility.
 */
void
kqueue_init(void)
{

	pool_init(&kqueue_pool, sizeof(struct kqueue), 0, 0, 0, "kqueuepl",
	    NULL);
	pool_init(&knote_pool, sizeof(struct knote), 0, 0, 0, "knotepl",
	    NULL);
}

/*
 * Find kfilter entry by name, or NULL if not found.
 */
static const struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return (&sys_kfilters[i]);
	}
	return (NULL);
}

static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	/* user_kfilters[] could be NULL if no filters were registered */
	if (!user_kfilters)
		return (NULL);

	for (i = 0; user_kfilters[i].name != NULL; i++) {
		/*
		 * Skip unregistered entries, whose name was reset to the
		 * empty string by kfilter_unregister().
		 */
		if (user_kfilters[i].name[0] != '\0' &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return (&user_kfilters[i]);
	}
	return (NULL);
}

static const struct kfilter *
kfilter_byname(const char *name)
{
	const struct kfilter *kfilter;

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return (kfilter);

	return (kfilter_byname_user(name));
}

/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static const struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	const struct kfilter *kfilter;

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}

/*
 * Register a new kfilter.  Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	void *space;
	int len;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */
	if (kfilter_byname(name) != NULL)
		return (EEXIST);	/* already exists */
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT)
		return (EINVAL);	/* too many */

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/*
		 * Grow in KFILTER_EXTENT chunks.  Use malloc(9), because we
		 * want to traverse user_kfilters as an array.  Note the
		 * array holds struct kfilter elements, not pointers, so
		 * size by sizeof(struct kfilter).
		 */
		user_kfiltermaxc += KFILTER_EXTENT;
		kfilter = malloc(user_kfiltermaxc * sizeof(struct kfilter),
		    M_KEVENT, M_WAITOK);

		/* copy existing user_kfilters */
		if (user_kfilters != NULL)
			memcpy((caddr_t)kfilter, (caddr_t)user_kfilters,
			    user_kfilterc * sizeof(struct kfilter));
		/* zero new sections */
		memset((caddr_t)kfilter +
		    user_kfilterc * sizeof(struct kfilter), 0,
		    (user_kfiltermaxc - user_kfilterc) *
		    sizeof(struct kfilter));
		/* switch to new kfilter */
		if (user_kfilters != NULL)
			free(user_kfilters, M_KEVENT);
		user_kfilters = kfilter;
	}
	len = strlen(name) + 1;		/* copy name */
	space = malloc(len, M_KEVENT, M_WAITOK);
	memcpy(space, name, len);
	user_kfilters[user_kfilterc].name = space;

	user_kfilters[user_kfilterc].filter = user_kfilterc + EVFILT_SYSCOUNT;

	len = sizeof(struct filterops);	/* copy filtops */
	space = malloc(len, M_KEVENT, M_WAITOK);
	memcpy(space, filtops, len);
	user_kfilters[user_kfilterc].filtops = space;

	if (retfilter != NULL)
		*retfilter = user_kfilters[user_kfilterc].filter;
	user_kfilterc++;		/* finally, increment count */
	return (0);
}
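
/*
 * Usage sketch (hypothetical; "EVFILT_MYDEV", mydev_filtops and the
 * filt_mydev* methods are made-up names): a driver can register its own
 * filter at attach time roughly as follows:
 *
 *	static const struct filterops mydev_filtops =
 *		{ 1, NULL, filt_mydevdetach, filt_mydev };
 *	int filterid, error;
 *
 *	error = kfilter_register("EVFILT_MYDEV", &mydev_filtops, &filterid);
 *
 * Userland can later map the name back to the id with the KFILTER_BYNAME
 * ioctl on a kqueue descriptor (see kqueue_ioctl() below).
 */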

/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the filter id isn't reused during this boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	if (kfilter_byname_sys(name) != NULL)
		return (EINVAL);	/* can't detach system filters */

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL)		/* not found */
		return (ENOENT);

	if (kfilter->name[0] != '\0') {
		/* XXX Cast away const (but we know it's safe). */
		free((void *) kfilter->name, M_KEVENT);
		kfilter->name = "";	/* mark as `not implemented' */
	}
	if (kfilter->filtops != NULL) {
		/* XXX Cast away const (but we know it's safe). */
		free((void *) kfilter->filtops, M_KEVENT);
		kfilter->filtops = NULL; /* mark as `not implemented' */
	}
	return (0);
}


/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors.  Calls struct fileops kqfilter method for given file
 * descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	struct file *fp;

	fp = kn->kn_fp;
	return ((*fp->f_ops->fo_kqfilter)(fp, kn));
}

/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = (struct kqueue *)kn->kn_fp->f_data;
	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
}

/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;

	kq = (struct kqueue *)kn->kn_fp->f_data;
	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;

	p = pfind(kn->kn_id);
	if (p == NULL)
		return (ESRCH);

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	if ((p->p_cred->p_ruid != curproc->p_cred->p_ruid ||
	    (p->p_flag & P_SUGID))
	    && suser(curproc->p_ucred, &curproc->p_acflag) != 0)
		return (EACCES);

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	/* XXXSMP lock the process? */
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);

	return (0);
}

/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_ptr.p_proc;
	KASSERT(p->p_stat == SDEAD || pfind(kn->kn_id) == p);

	/* XXXSMP lock the process? */
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
}

/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished.
	 */
	if (event == NOTE_EXIT) {
		/*
		 * Detach the knote from watched process and mark
		 * it as such.  We can't leave this to kqueue_scan(),
		 * since the process might not exist by then.  And we
		 * have to do this now, since psignal KNOTE() is called
		 * also for zombies and we might end up reading freed
		 * memory if the kevent would already be picked up
		 * and knote g/c'ed.
		 */
		kn->kn_fop->f_detach(kn);
		kn->kn_status |= KN_DETACHED;

		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll can do.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides a full kqfilter entry for device switch tables, which
 * has the same effect as a filter using filt_seltrue() as its filter
 * method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

static const struct filterops seltrue_filtops =
	{ 1, NULL, filt_seltruedetach, filt_seltrue };

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (1);
	}

	/* Nothing more to do */
	return (0);
}
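
/*
 * A device that is always ready for I/O can point the kqfilter entry of
 * its device switch directly at seltrue_kqfilter().  Hypothetical driver
 * sketch (the "mydev" name is made up):
 *
 *	int
 *	mydev_kqfilter(dev_t dev, struct knote *kn)
 *	{
 *		return (seltrue_kqfilter(dev, kn));
 *	}
 */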

/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, void *v, register_t *retval)
{
	struct filedesc	*fdp;
	struct kqueue	*kq;
	struct file	*fp;
	struct proc	*p;
	int		fd, error;

	p = l->l_proc;
	fdp = p->p_fd;
	error = falloc(p, &fp, &fd);	/* setup a new file descriptor */
	if (error)
		return (error);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = pool_get(&kqueue_pool, PR_WAITOK);
	memset((char *)kq, 0, sizeof(struct kqueue));
	TAILQ_INIT(&kq->kq_head);
	fp->f_data = (caddr_t)kq;	/* store the kqueue with the fp */
	*retval = fd;
	if (fdp->fd_knlistsize < 0)
		fdp->fd_knlistsize = 0;	/* this process has a kq */
	kq->kq_fdp = fdp;
	FILE_SET_MATURE(fp);
	FILE_UNUSE(fp, p);		/* falloc() does FILE_USE() */
	return (error);
}

/*
 * kevent(2) system call.
 */
int
sys_kevent(struct lwp *l, void *v, register_t *retval)
{
	struct sys_kevent_args /* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */ *uap = v;
	struct kevent	*kevp;
	struct kqueue	*kq;
	struct file	*fp;
	struct timespec	ts;
	struct proc	*p;
	size_t		i, n;
	int		nerrors, error;

	p = l->l_proc;
	/* check that we're dealing with a kq */
	fp = fd_getfile(p->p_fd, SCARG(uap, fd));
	if (!fp || fp->f_type != DTYPE_KQUEUE)
		return (EBADF);

	FILE_USE(fp);

	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error)
			goto done;
		SCARG(uap, timeout) = &ts;
	}

	kq = (struct kqueue *)fp->f_data;
	nerrors = 0;

	/* traverse list of events to register */
	while (SCARG(uap, nchanges) > 0) {
		/* copyin a maximum of KQ_NEVENTS at each pass */
		n = MIN(SCARG(uap, nchanges), KQ_NEVENTS);
		error = copyin(SCARG(uap, changelist), kq->kq_kev,
		    n * sizeof(struct kevent));
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kq->kq_kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp, p);
			if (error) {
				if (SCARG(uap, nevents) != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					error = copyout((caddr_t)kevp,
					    (caddr_t)SCARG(uap, eventlist),
					    sizeof(*kevp));
					if (error)
						goto done;
					SCARG(uap, eventlist)++;
					SCARG(uap, nevents)--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		SCARG(uap, nchanges) -= n;	/* update the results */
		SCARG(uap, changelist) += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, SCARG(uap, nevents), SCARG(uap, eventlist),
	    SCARG(uap, timeout), p, retval);
 done:
	FILE_UNUSE(fp, p);
	return (error);
}
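
/*
 * Userland usage sketch (illustrative only; not part of this file):
 * watch descriptor "fd" for readability and block until it triggers.
 *
 *	struct kevent ev;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	n = kevent(kq, &ev, 1, NULL, 0, NULL);	(register the change)
 *	n = kevent(kq, NULL, 0, &ev, 1, NULL);	(wait for one event)
 */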

/*
 * Register a given kevent kev onto the kqueue
 */
int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p)
{
	const struct kfilter *kfilter;
	struct filedesc	*fdp;
	struct file	*fp;
	struct knote	*kn;
	int		s, error;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		return (EINVAL);
	}

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		if ((fp = fd_getfile(fdp, kev->ident)) == NULL)
			return (EBADF);	/* validate descriptor */
		FILE_USE(fp);

		if (kev->ident < fdp->fd_knlistsize) {
			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		if (fdp->fd_knhashmask != 0) {
			struct klist *list;

			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;		/* filter not found */
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		/* add knote */

		if (kn == NULL) {
			/* create new knote */
			kn = pool_get(&knote_pool, PR_WAITOK);
			if (kn == NULL) {
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			knote_attach(kn, fdp);
			if ((error = kfilter->filtops->f_attach(kn)) != 0) {
				knote_drop(kn, p, fdp);
				goto done;
			}
		} else {
			/* modify existing knote */

			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		s = splhigh();
		if (kn->kn_fop->f_event(kn, 0))
			KNOTE_ACTIVATE(kn);
		splx(s);

	} else if (kev->flags & EV_DELETE) {	/* delete knote */
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p, fdp);
		goto done;
	}

	/* disable knote */
	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		s = splhigh();
		kn->kn_status |= KN_DISABLED;
		splx(s);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		s = splhigh();
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
		splx(s);
	}

 done:
	if (fp != NULL)
		FILE_UNUSE(fp, p);
	return (error);
}

/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results into ulistp.  The timeout is determined by tsp:
 * if NULL, wait indefinitely; if zero-valued, perform a poll; otherwise
 * wait as appropriate.
 */
static int
kqueue_scan(struct file *fp, size_t maxevents, struct kevent *ulistp,
    const struct timespec *tsp, struct proc *p, register_t *retval)
{
	struct kqueue	*kq;
	struct kevent	*kevp;
	struct timeval	atv;
	struct knote	*kn, marker;
	size_t		count, nkev;
	int		s, timeout, error;

	kq = (struct kqueue *)fp->f_data;
	count = maxevents;
	nkev = error = 0;
	if (count == 0)
		goto done;

	if (tsp != NULL) {			/* timeout supplied */
		TIMESPEC_TO_TIMEVAL(&atv, tsp);
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);	/* calc. time to wait until */
		splx(s);
		if (tsp->tv_sec == 0 && tsp->tv_nsec < 1000 /*<1us*/)
			timeout = -1;		/* perform a poll */
		else
			timeout = hzto(&atv);	/* calculate hz till timeout */
	} else {
		atv.tv_sec = 0;			/* no timeout, wait forever */
		atv.tv_usec = 0;
		timeout = 0;
	}
	goto start;

 retry:
	if (atv.tv_sec || atv.tv_usec) {	/* timeout requested */
		s = splclock();
		if (timercmp(&time, &atv, >=)) {
			splx(s);
			goto done;		/* timeout reached */
		}
		splx(s);
		timeout = hzto(&atv);		/* recalc. timeout remaining */
	}

 start:
	kevp = kq->kq_kev;
	s = splhigh();
	if (kq->kq_count == 0) {
		if (timeout < 0) {
			error = EWOULDBLOCK;
		} else {
			kq->kq_state |= KQ_SLEEP;
			error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
		}
		splx(s);
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* mark end of knote list */
	TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);

	while (count) {				/* while user wants data ... */
		kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if (kn == &marker) {		/* if it's our marker, stop */
			splx(s);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		if (kn->kn_status & KN_DISABLED) {
			/* don't want disabled events */
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
		    kn->kn_fop->f_event(kn, 0) == 0) {
			/*
			 * non-ONESHOT event that hasn't
			 * triggered again, so de-queue.
			 */
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
			kq->kq_count--;
			continue;
		}
		*kevp = kn->kn_kevent;
		kevp++;
		nkev++;
		if (kn->kn_flags & EV_ONESHOT) {
			/* delete ONESHOT events after retrieval */
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			splx(s);
			kn->kn_fop->f_detach(kn);
			knote_drop(kn, p, p->p_fd);
			s = splhigh();
		} else if (kn->kn_flags & EV_CLEAR) {
			/* clear state after retrieval */
			kn->kn_data = 0;
			kn->kn_fflags = 0;
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
			kq->kq_count--;
		} else {
			/* add event back on list */
			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		}
		count--;
		if (nkev == KQ_NEVENTS) {
			/* do copyouts in KQ_NEVENTS chunks */
			splx(s);
			error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
			    sizeof(struct kevent) * nkev);
			ulistp += nkev;
			nkev = 0;
			kevp = kq->kq_kev;
			s = splhigh();
			if (error)
				break;
		}
	}

	/* remove marker */
	TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
	splx(s);
 done:
	if (nkev != 0) {
		/* copyout remaining events */
		error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
		    sizeof(struct kevent) * nkev);
	}
	*retval = maxevents - count;

	return (error);
}

/*
 * struct fileops read method for a kqueue descriptor.
 * Not implemented.
 * XXX: This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, off_t *offset, struct uio *uio,
    struct ucred *cred, int flags)
{

	return (ENXIO);
}

/*
 * struct fileops write method for a kqueue descriptor.
 * Not implemented.
 */
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, off_t *offset, struct uio *uio,
    struct ucred *cred, int flags)
{

	return (ENXIO);
}

/*
 * struct fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported.  They both use struct kfilter_mapping:
 *	KFILTER_BYFILTER	find name for filter, and return result in
 *				name, which is of size len.
 *	KFILTER_BYNAME		find filter for name.  len is ignored.
 */
/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
{
	struct kfilter_mapping	*km;
	const struct kfilter	*kfilter;
	char			*name;
	int			error;

	km = (struct kfilter_mapping *)data;
	error = 0;

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL)
			error = copyoutstr(kfilter->name, km->name, km->len,
			    NULL);
		else
			error = ENOENT;
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		MALLOC(name, char *, KFILTER_MAXNAME, M_KEVENT, M_WAITOK);
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error) {
			FREE(name, M_KEVENT);
			break;
		}
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		FREE(name, M_KEVENT);
		break;

	default:
		error = ENOTTY;
		break;
	}
	return (error);
}

/*
 * struct fileops fcntl method for a kqueue descriptor.
 * Not implemented.
 */
/*ARGSUSED*/
static int
kqueue_fcntl(struct file *fp, u_int com, caddr_t data, struct proc *p)
{

	return (ENOTTY);
}

/*
 * struct fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(struct file *fp, int events, struct proc *p)
{
	struct kqueue	*kq;
	int		revents;

	kq = (struct kqueue *)fp->f_data;
	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(p, &kq->kq_sel);
		}
	}
	return (revents);
}

/*
 * struct fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
{
	struct kqueue	*kq;

	kq = (struct kqueue *)fp->f_data;
	memset((void *)st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

/*
 * struct fileops close method for a kqueue descriptor.
 * Cleans up kqueue.
 */
static int
kqueue_close(struct file *fp, struct proc *p)
{
	struct kqueue	*kq;
	struct filedesc	*fdp;
	struct knote	**knp, *kn, *kn0;
	int		i;

	kq = (struct kqueue *)fp->f_data;
	fdp = p->p_fd;
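	/*
	 * Walk all knote lists in this process, unlinking and freeing
	 * every knote that belongs to the kqueue being closed: first the
	 * per-descriptor lists, then the identity hash used by non-fd
	 * filters.
	 */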
	for (i = 0; i < fdp->fd_knlistsize; i++) {
		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
		kn = *knp;
		while (kn != NULL) {
			kn0 = SLIST_NEXT(kn, kn_link);
			if (kq == kn->kn_kq) {
				kn->kn_fop->f_detach(kn);
				FILE_UNUSE(kn->kn_fp, p);
				pool_put(&knote_pool, kn);
				*knp = kn0;
			} else {
				knp = &SLIST_NEXT(kn, kn_link);
			}
			kn = kn0;
		}
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
			kn = *knp;
			while (kn != NULL) {
				kn0 = SLIST_NEXT(kn, kn_link);
				if (kq == kn->kn_kq) {
					kn->kn_fop->f_detach(kn);
					/* XXX non-fd release of kn->kn_ptr */
					pool_put(&knote_pool, kn);
					*knp = kn0;
				} else {
					knp = &SLIST_NEXT(kn, kn_link);
				}
				kn = kn0;
			}
		}
	}
	pool_put(&kqueue_pool, kq);
	fp->f_data = NULL;

	return (0);
}

/*
 * Wake up a kqueue.
 */
static void
kqueue_wakeup(struct kqueue *kq)
{

	if (kq->kq_state & KQ_SLEEP) {	/* if currently sleeping ... */
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);		/* ... wakeup */
	}

	/* Notify select/poll and kevent. */
	selnotify(&kq->kq_sel, 0);
}

/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
/*ARGSUSED*/
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq;

	KASSERT(fp == kn->kn_fp);
	kq = (struct kqueue *)kn->kn_fp->f_data;
	if (kn->kn_filter != EVFILT_READ)
		return (1);
	kn->kn_fop = &kqread_filtops;
	SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
	return (0);
}


/*
 * Walk down a list of knotes, activating them if their event has triggered.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext)
		if (kn->kn_fop->f_event(kn, hint))
			KNOTE_ACTIVATE(kn);
}

/*
 * Remove all knotes from a specified klist
 */
void
knote_remove(struct proc *p, struct klist *list)
{
	struct knote *kn;

	while ((kn = SLIST_FIRST(list)) != NULL) {
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p, p->p_fd);
	}
}

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc	*fdp;
	struct klist	*list;

	fdp = p->p_fd;
	list = &fdp->fd_knlist[fd];
	knote_remove(p, list);
}

/*
 * Attach a new knote to a file descriptor
 */
static void
knote_attach(struct knote *kn, struct filedesc *fdp)
{
	struct klist	*list;
	int		size;

	if (! kn->kn_fop->f_isfd) {
		/* if knote is not on an fd, store on internal hash table */
		if (fdp->fd_knhashmask == 0)
			fdp->fd_knhash = hashinit(KN_HASHSIZE, HASH_LIST,
			    M_KEVENT, M_WAITOK, &fdp->fd_knhashmask);
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		goto done;
	}

	/*
	 * otherwise, knote is on an fd.
	 * knotes are stored in fd_knlist indexed by kn->kn_id.
	 */
	if (fdp->fd_knlistsize <= kn->kn_id) {
		/* expand list, it's too small */
		size = fdp->fd_knlistsize;
		while (size <= kn->kn_id) {
			/* grow in KQ_EXTENT chunks */
			size += KQ_EXTENT;
		}
		list = malloc(size * sizeof(struct klist *), M_KEVENT,
		    M_WAITOK);
		if (fdp->fd_knlist) {
			/* copy existing knlist */
			memcpy((caddr_t)list, (caddr_t)fdp->fd_knlist,
			    fdp->fd_knlistsize * sizeof(struct klist *));
		}
		/*
		 * Zero new memory.  Stylistically, SLIST_INIT() should be
		 * used here, but that does the same thing as the memset()
		 * anyway.
		 */
		memset(&list[fdp->fd_knlistsize], 0,
		    (size - fdp->fd_knlistsize) * sizeof(struct klist *));

		/* switch to new knlist */
		if (fdp->fd_knlist != NULL)
			free(fdp->fd_knlist, M_KEVENT);
		fdp->fd_knlistsize = size;
		fdp->fd_knlist = list;
	}

	/* get list head for this fd */
	list = &fdp->fd_knlist[kn->kn_id];
 done:
	/* add new knote */
	SLIST_INSERT_HEAD(list, kn, kn_link);
	kn->kn_status = 0;
}

/*
 * Drop knote.
 * Should be called at spl == 0, since we don't want to hold spl
 * while calling FILE_UNUSE and free.
 */
static void
knote_drop(struct knote *kn, struct proc *p, struct filedesc *fdp)
{
	struct klist	*list;

	if (kn->kn_fop->f_isfd)
		list = &fdp->fd_knlist[kn->kn_id];
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_fop->f_isfd)
		FILE_UNUSE(kn->kn_fp, p);
	pool_put(&knote_pool, kn);
}


/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue	*kq;
	int		s;

	kq = kn->kn_kq;
	s = splhigh();
	KASSERT((kn->kn_status & KN_QUEUED) == 0);

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	splx(s);
	kqueue_wakeup(kq);
}

/*
 * Dequeue event for knote.
 */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue	*kq;
	int		s;

	kq = kn->kn_kq;
	s = splhigh();
	KASSERT(kn->kn_status & KN_QUEUED);

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
	splx(s);
}
1327