/*	$NetBSD: kern_event.c,v 1.4.2.3 2002/11/12 12:49:10 skrll Exp $	*/
/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/pool.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>

static int	kqueue_scan(struct file *fp, size_t maxevents,
		    struct kevent *ulistp, const struct timespec *timeout,
		    struct proc *p, register_t *retval);
static void	kqueue_wakeup(struct kqueue *kq);

static int	kqueue_read(struct file *fp, off_t *offset, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_write(struct file *fp, off_t *offset, struct uio *uio,
		    struct ucred *cred, int flags);
static int	kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
		    struct proc *p);
static int	kqueue_fcntl(struct file *fp, u_int com, caddr_t data,
		    struct proc *p);
static int	kqueue_poll(struct file *fp, int events, struct proc *p);
static int	kqueue_kqfilter(struct file *fp, struct knote *kn);
static int	kqueue_stat(struct file *fp, struct stat *sp, struct proc *p);
static int	kqueue_close(struct file *fp, struct proc *p);

static struct fileops kqueueops = {
	kqueue_read, kqueue_write, kqueue_ioctl, kqueue_fcntl, kqueue_poll,
	kqueue_stat, kqueue_close, kqueue_kqfilter
};

static void	knote_attach(struct knote *kn, struct filedesc *fdp);
static void	knote_drop(struct knote *kn, struct proc *p,
		    struct filedesc *fdp);
static void	knote_enqueue(struct knote *kn);
static void	knote_dequeue(struct knote *kn);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);

static const struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
static const struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static const struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };

struct pool	kqueue_pool;
struct pool	knote_pool;

#define	KNOTE_ACTIVATE(kn)						\
do {									\
	kn->kn_status |= KN_ACTIVE;					\
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue(kn);					\
} while (0)

#define	KN_HASHSIZE		64	/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;
/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	const struct filterops *filtops;/* operations for filter */
};

/* System defined filters */
static const struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	&file_filtops },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	&file_filtops },
	{ "EVFILT_AIO",		EVFILT_AIO,	NULL },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	&file_filtops },
	{ "EVFILT_PROC",	EVFILT_PROC,	&proc_filtops },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	&sig_filtops },
	{ NULL,			0,		NULL },	/* end of list */
};

/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */

/*
 * kqueue_init:
 *
 *	Initialize the kqueue/knote facility.
 */
void
kqueue_init(void)
{

	pool_init(&kqueue_pool, sizeof(struct kqueue), 0, 0, 0, "kqueuepl",
	    NULL);
	pool_init(&knote_pool, sizeof(struct knote), 0, 0, 0, "knotepl",
	    NULL);
}

/*
 * Find kfilter entry by name, or NULL if not found.
 */
static const struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return (&sys_kfilters[i]);
	}
	return (NULL);
}

static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	/* user_kfilters[] could be NULL if no filters were registered */
	if (!user_kfilters)
		return (NULL);

	for (i = 0; user_kfilters[i].name != NULL; i++) {
		if (user_kfilters[i].name[0] != '\0' &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return (&user_kfilters[i]);
	}
	return (NULL);
}

static const struct kfilter *
kfilter_byname(const char *name)
{
	const struct kfilter *kfilter;

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return (kfilter);

	return (kfilter_byname_user(name));
}

/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static const struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	const struct kfilter *kfilter;

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}

/*
 * Register a new kfilter. Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	void *space;
	int len;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */
	if (kfilter_byname(name) != NULL)
		return (EEXIST);	/* already exists */
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT)
		return (EINVAL);	/* too many */

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/*
		 * Grow in KFILTER_EXTENT chunks. Use malloc(9), because we
		 * want to traverse user_kfilters as an array.
		 */
		user_kfiltermaxc += KFILTER_EXTENT;
		kfilter = malloc(user_kfiltermaxc * sizeof(struct kfilter),
		    M_KEVENT, M_WAITOK);

		/* copy existing user_kfilters */
		if (user_kfilters != NULL)
			memcpy((caddr_t)kfilter, (caddr_t)user_kfilters,
			    user_kfilterc * sizeof(struct kfilter));
		/* zero new sections */
		memset((caddr_t)kfilter +
		    user_kfilterc * sizeof(struct kfilter), 0,
		    (user_kfiltermaxc - user_kfilterc) *
		    sizeof(struct kfilter));
		/* switch to new kfilter */
		if (user_kfilters != NULL)
			free(user_kfilters, M_KEVENT);
		user_kfilters = kfilter;
	}
	len = strlen(name) + 1;		/* copy name */
	space = malloc(len, M_KEVENT, M_WAITOK);
	memcpy(space, name, len);
	user_kfilters[user_kfilterc].name = space;

	user_kfilters[user_kfilterc].filter = user_kfilterc + EVFILT_SYSCOUNT;

	len = sizeof(struct filterops);	/* copy filtops */
	space = malloc(len, M_KEVENT, M_WAITOK);
	memcpy(space, filtops, len);
	user_kfilters[user_kfilterc].filtops = space;

	if (retfilter != NULL)
		*retfilter = user_kfilters[user_kfilterc].filter;
	user_kfilterc++;		/* finally, increment count */
	return (0);
}

/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	if (kfilter_byname_sys(name) != NULL)
		return (EINVAL);	/* can't detach system filters */

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL)		/* not found */
		return (ENOENT);

	if (kfilter->name[0] != '\0') {
		/* XXX Cast away const (but we know it's safe). */
		free((void *)kfilter->name, M_KEVENT);
		kfilter->name = "";	/* mark as `not implemented' */
	}
	if (kfilter->filtops != NULL) {
		/* XXX Cast away const (but we know it's safe). */
		free((void *)kfilter->filtops, M_KEVENT);
		kfilter->filtops = NULL; /* mark as `not implemented' */
	}
	return (0);
}
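
/*
 * Example (illustrative sketch, not part of this file): a kernel
 * component could register a custom filter with the pair of routines
 * above and later remove it.  The returned filter id is at or above
 * EVFILT_SYSCOUNT.  The names filt_myattach, filt_mydetach and
 * filt_myevent are assumed helpers implementing the filterops
 * methods, and "EVFILT_MYFILTER" is a hypothetical name.
 *
 *	static const struct filterops my_filtops =
 *		{ 0, filt_myattach, filt_mydetach, filt_myevent };
 *	int filter, error;
 *
 *	error = kfilter_register("EVFILT_MYFILTER", &my_filtops, &filter);
 *	if (error == 0) {
 *		...
 *		error = kfilter_unregister("EVFILT_MYFILTER");
 *	}
 */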

/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors. Calls struct fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	struct file *fp;

	fp = kn->kn_fp;
	return ((*fp->f_ops->fo_kqfilter)(fp, kn));
}

/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = (struct kqueue *)kn->kn_fp->f_data;
	SLIST_REMOVE(&kq->kq_sel.si_klist, kn, knote, kn_selnext);
}

/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;

	kq = (struct kqueue *)kn->kn_fp->f_data;
	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;

	p = pfind(kn->kn_id);
	if (p == NULL)
		return (ESRCH);

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	if ((p->p_cred->p_ruid != curproc->p_cred->p_ruid ||
	    (p->p_flag & P_SUGID))
	    && suser(curproc->p_ucred, &curproc->p_acflag) != 0)
		return (EACCES);

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	/* XXXSMP lock the process? */
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);

	return (0);
}

/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out. However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_ptr.p_proc;
	KASSERT(p->p_stat == SDEAD || pfind(kn->kn_id) == p);

	/* XXXSMP lock the process? */
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
}

/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished.
	 */
	if (event == NOTE_EXIT) {
		/*
		 * Detach the knote from the watched process and mark
		 * it as such. We can't leave this to kqueue_scan(),
		 * since the process might not exist by then. And we
		 * have to do this now, since the KNOTE() in psignal()
		 * is called also for zombies and we might end up
		 * reading freed memory if the kevent would already be
		 * picked up and the knote g/c'ed.
		 */
		kn->kn_fop->f_detach(kn);
		kn->kn_status |= KN_DETACHED;

		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides a full kqfilter entry for device switch tables, which
 * has the same effect as a filter using filt_seltrue() as its filter
 * method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

static const struct filterops seltrue_filtops =
	{ 1, NULL, filt_seltruedetach, filt_seltrue };

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (1);
	}

	/* Nothing more to do */
	return (0);
}
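
/*
 * Example (illustrative sketch): a driver whose device is always
 * "ready" for I/O can point the kqfilter entry of its device switch
 * at a routine that simply forwards here.  "mydev" is a hypothetical
 * driver name used only for illustration.
 *
 *	int
 *	mydev_kqfilter(dev_t dev, struct knote *kn)
 *	{
 *
 *		return (seltrue_kqfilter(dev, kn));
 *	}
 */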

/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, void *v, register_t *retval)
{
	struct filedesc *fdp;
	struct kqueue *kq;
	struct file *fp;
	struct proc *p;
	int fd, error;

	p = l->l_proc;
	fdp = p->p_fd;
	error = falloc(p, &fp, &fd);	/* setup a new file descriptor */
	if (error)
		return (error);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = pool_get(&kqueue_pool, PR_WAITOK);
	memset((char *)kq, 0, sizeof(struct kqueue));
	TAILQ_INIT(&kq->kq_head);
	fp->f_data = (caddr_t)kq;	/* store the kqueue with the fp */
	*retval = fd;
	if (fdp->fd_knlistsize < 0)
		fdp->fd_knlistsize = 0;	/* this process has a kq */
	kq->kq_fdp = fdp;
	FILE_SET_MATURE(fp);
	FILE_UNUSE(fp, p);		/* falloc() does FILE_USE() */
	return (error);
}

/*
 * kevent(2) system call.
 */
int
sys_kevent(struct lwp *l, void *v, register_t *retval)
{
	struct sys_kevent_args /* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */ *uap = v;
	struct kevent *kevp;
	struct kqueue *kq;
	struct file *fp;
	struct timespec ts;
	struct proc *p;
	size_t i, n;
	int nerrors, error;

	p = l->l_proc;
	/* check that we're dealing with a kq */
	fp = fd_getfile(p->p_fd, SCARG(uap, fd));
	if (!fp || fp->f_type != DTYPE_KQUEUE)
		return (EBADF);

	FILE_USE(fp);

	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error)
			goto done;
		SCARG(uap, timeout) = &ts;
	}

	kq = (struct kqueue *)fp->f_data;
	nerrors = 0;

	/* traverse list of events to register */
	while (SCARG(uap, nchanges) > 0) {
		/* copyin a maximum of KQ_NEVENTS at each pass */
		n = MIN(SCARG(uap, nchanges), KQ_NEVENTS);
		error = copyin(SCARG(uap, changelist), kq->kq_kev,
		    n * sizeof(struct kevent));
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kq->kq_kev[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp, p);
			if (error) {
				if (SCARG(uap, nevents) != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					error = copyout((caddr_t)kevp,
					    (caddr_t)SCARG(uap, eventlist),
					    sizeof(*kevp));
					if (error)
						goto done;
					SCARG(uap, eventlist)++;
					SCARG(uap, nevents)--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		SCARG(uap, nchanges) -= n;	/* update the results */
		SCARG(uap, changelist) += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, SCARG(uap, nevents), SCARG(uap, eventlist),
	    SCARG(uap, timeout), p, retval);
 done:
	FILE_UNUSE(fp, p);
	return (error);
}
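
/*
 * Example (illustrative sketch, from userland): the usual pattern for
 * these two syscalls is to create a queue, register interest in a
 * descriptor, then wait for events; fd is an assumed open descriptor.
 *
 *	struct kevent ev;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent: register");
 *	n = kevent(kq, NULL, 0, &ev, 1, NULL);
 *
 * The second kevent() call blocks until fd becomes readable and then
 * reports the event; for EVFILT_READ, ev.data holds the amount of
 * data available to read.
 */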

/*
 * Register a given kevent kev onto the kqueue
 */
int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p)
{
	const struct kfilter *kfilter;
	struct filedesc *fdp;
	struct file *fp;
	struct knote *kn;
	int s, error;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		return (EINVAL);
	}

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		if ((fp = fd_getfile(fdp, kev->ident)) == NULL)
			return (EBADF);	/* validate descriptor */
		FILE_USE(fp);

		if (kev->ident < fdp->fd_knlistsize) {
			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		if (fdp->fd_knhashmask != 0) {
			struct klist *list;

			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		error = ENOENT;		/* filter not found */
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		/* add knote */

		if (kn == NULL) {
			/* create new knote */
			kn = pool_get(&knote_pool, PR_WAITOK);
			if (kn == NULL) {
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			knote_attach(kn, fdp);
			if ((error = kfilter->filtops->f_attach(kn)) != 0) {
				knote_drop(kn, p, fdp);
				goto done;
			}
		} else {
			/* modify existing knote */

			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filters which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		s = splhigh();
		if (kn->kn_fop->f_event(kn, 0))
			KNOTE_ACTIVATE(kn);
		splx(s);

	} else if (kev->flags & EV_DELETE) {	/* delete knote */
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p, fdp);
		goto done;
	}

	/* disable knote */
	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		s = splhigh();
		kn->kn_status |= KN_DISABLED;
		splx(s);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		s = splhigh();
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
		splx(s);
	}

 done:
	if (fp != NULL)
		FILE_UNUSE(fp, p);
	return (error);
}

/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results to ulistp. The timeout is determined by tsp;
 * if NULL, wait indefinitely; if zero-valued, perform a poll;
 * otherwise, wait as appropriate.
 */
static int
kqueue_scan(struct file *fp, size_t maxevents, struct kevent *ulistp,
    const struct timespec *tsp, struct proc *p, register_t *retval)
{
	struct kqueue *kq;
	struct kevent *kevp;
	struct timeval atv;
	struct knote *kn, marker;
	size_t count, nkev;
	int s, timeout, error;

	kq = (struct kqueue *)fp->f_data;
	count = maxevents;
	nkev = error = 0;
	if (count == 0)
		goto done;

	if (tsp != NULL) {	/* timeout supplied */
		TIMESPEC_TO_TIMEVAL(&atv, tsp);
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);	/* calc. time to wait until */
		splx(s);
		if (tsp->tv_sec == 0 && tsp->tv_nsec < 1000 /*<1us*/)
			timeout = -1;		/* perform a poll */
		else
			timeout = hzto(&atv);	/* calculate hz till timeout */
	} else {
		atv.tv_sec = 0;		/* no timeout, wait forever */
		atv.tv_usec = 0;
		timeout = 0;
	}
	goto start;

 retry:
	if (atv.tv_sec || atv.tv_usec) {	/* timeout requested */
		s = splclock();
		if (timercmp(&time, &atv, >=)) {
			splx(s);
			goto done;	/* timeout reached */
		}
		splx(s);
		timeout = hzto(&atv);	/* recalc. timeout remaining */
	}

 start:
	kevp = kq->kq_kev;
	s = splhigh();
	if (kq->kq_count == 0) {
		if (timeout < 0) {
			error = EWOULDBLOCK;
		} else {
			kq->kq_state |= KQ_SLEEP;
			error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
		}
		splx(s);
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* mark end of knote list */
	TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);

	while (count) {		/* while user wants data ... */
		kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if (kn == &marker) {	/* if it's our marker, stop */
			splx(s);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		if (kn->kn_status & KN_DISABLED) {
			/* don't want disabled events */
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
		    kn->kn_fop->f_event(kn, 0) == 0) {
			/*
			 * non-ONESHOT event that hasn't
			 * triggered again, so de-queue.
			 */
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
			kq->kq_count--;
			continue;
		}
		*kevp = kn->kn_kevent;
		kevp++;
		nkev++;
		if (kn->kn_flags & EV_ONESHOT) {
			/* delete ONESHOT events after retrieval */
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			splx(s);
			kn->kn_fop->f_detach(kn);
			knote_drop(kn, p, p->p_fd);
			s = splhigh();
		} else if (kn->kn_flags & EV_CLEAR) {
			/* clear state after retrieval */
			kn->kn_data = 0;
			kn->kn_fflags = 0;
			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
			kq->kq_count--;
		} else {
			/* add event back on list */
			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		}
		count--;
		if (nkev == KQ_NEVENTS) {
			/* do copyouts in KQ_NEVENTS chunks */
			splx(s);
			error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
			    sizeof(struct kevent) * nkev);
			ulistp += nkev;
			nkev = 0;
			kevp = kq->kq_kev;
			s = splhigh();
			if (error)
				break;
		}
	}

	/* remove marker */
	TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
	splx(s);
 done:
	if (nkev != 0) {
		/* copyout remaining events */
		error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp,
		    sizeof(struct kevent) * nkev);
	}
	*retval = maxevents - count;

	return (error);
}

/*
 * struct fileops read method for a kqueue descriptor.
 * Not implemented.
 * XXX: This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, off_t *offset, struct uio *uio,
    struct ucred *cred, int flags)
{

	return (ENXIO);
}

/*
 * struct fileops write method for a kqueue descriptor.
 * Not implemented.
 */
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, off_t *offset, struct uio *uio,
    struct ucred *cred, int flags)
{

	return (ENXIO);
}

/*
 * struct fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported. They both use struct kfilter_mapping:
 *	KFILTER_BYFILTER	find name for filter, and return result in
 *				name, which is of size len.
 *	KFILTER_BYNAME		find filter for name. len is ignored.
 */
/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
{
	struct kfilter_mapping *km;
	const struct kfilter *kfilter;
	char *name;
	int error;

	km = (struct kfilter_mapping *)data;
	error = 0;

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL)
			error = copyoutstr(kfilter->name, km->name, km->len,
			    NULL);
		else
			error = ENOENT;
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		MALLOC(name, char *, KFILTER_MAXNAME, M_KEVENT, M_WAITOK);
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error) {
			FREE(name, M_KEVENT);
			break;
		}
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		FREE(name, M_KEVENT);
		break;

	default:
		error = ENOTTY;

	}
	return (error);
}
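
/*
 * Example (illustrative sketch, from userland): translating a filter
 * name to its id with KFILTER_BYNAME; kq is an assumed descriptor
 * returned by kqueue(2), and the kfilter_mapping fields used are the
 * ones referenced above (name, len, filter).
 *
 *	struct kfilter_mapping km;
 *	uint32_t filter;
 *
 *	km.name = "EVFILT_READ";
 *	if (ioctl(kq, KFILTER_BYNAME, &km) == 0)
 *		filter = km.filter;
 */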

/*
 * struct fileops fcntl method for a kqueue descriptor.
 * Not implemented.
 */
/*ARGSUSED*/
static int
kqueue_fcntl(struct file *fp, u_int com, caddr_t data, struct proc *p)
{

	return (ENOTTY);
}

/*
 * struct fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(struct file *fp, int events, struct proc *p)
{
	struct kqueue *kq;
	int revents;

	kq = (struct kqueue *)fp->f_data;
	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(p, &kq->kq_sel);
		}
	}
	return (revents);
}

/*
 * struct fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
{
	struct kqueue *kq;

	kq = (struct kqueue *)fp->f_data;
	memset((void *)st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;
	return (0);
}

/*
 * struct fileops close method for a kqueue descriptor.
 * Cleans up kqueue.
 */
static int
kqueue_close(struct file *fp, struct proc *p)
{
	struct kqueue *kq;
	struct filedesc *fdp;
	struct knote **knp, *kn, *kn0;
	int i;

	kq = (struct kqueue *)fp->f_data;
	fdp = p->p_fd;
	for (i = 0; i < fdp->fd_knlistsize; i++) {
		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
		kn = *knp;
		while (kn != NULL) {
			kn0 = SLIST_NEXT(kn, kn_link);
			if (kq == kn->kn_kq) {
				kn->kn_fop->f_detach(kn);
				FILE_UNUSE(kn->kn_fp, p);
				pool_put(&knote_pool, kn);
				*knp = kn0;
			} else {
				knp = &SLIST_NEXT(kn, kn_link);
			}
			kn = kn0;
		}
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
			kn = *knp;
			while (kn != NULL) {
				kn0 = SLIST_NEXT(kn, kn_link);
				if (kq == kn->kn_kq) {
					kn->kn_fop->f_detach(kn);
					/* XXX non-fd release of kn->kn_ptr */
					pool_put(&knote_pool, kn);
					*knp = kn0;
				} else {
					knp = &SLIST_NEXT(kn, kn_link);
				}
				kn = kn0;
			}
		}
	}
	pool_put(&kqueue_pool, kq);
	fp->f_data = NULL;

	return (0);
}

/*
 * wakeup a kqueue
 */
static void
kqueue_wakeup(struct kqueue *kq)
{

	if (kq->kq_state & KQ_SLEEP) {	/* if currently sleeping ... */
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);		/* ... wakeup */
	}

	/* Notify select/poll and kevent. */
	selnotify(&kq->kq_sel, 0);
}

/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
/*ARGSUSED*/
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq;

	KASSERT(fp == kn->kn_fp);
	kq = (struct kqueue *)kn->kn_fp->f_data;
	if (kn->kn_filter != EVFILT_READ)
		return (1);
	kn->kn_fop = &kqread_filtops;
	SLIST_INSERT_HEAD(&kq->kq_sel.si_klist, kn, kn_selnext);
	return (0);
}


/*
 * Walk down a list of knotes, activating them if their event has triggered.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext)
		if (kn->kn_fop->f_event(kn, hint))
			KNOTE_ACTIVATE(kn);
}
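
/*
 * Example (illustrative sketch): a driver that has just produced data
 * can activate any knotes attached to it, e.g. from its interrupt
 * handler.  sc_rsel is an assumed struct selinfo in a hypothetical
 * softc; most code instead calls selnotify(&sc->sc_rsel, 0), which
 * both wakes select/poll waiters and runs KNOTE() on the same klist.
 *
 *	knote(&sc->sc_rsel.si_klist, 0);
 */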

/*
 * Remove all knotes from a specified klist
 */
void
knote_remove(struct proc *p, struct klist *list)
{
	struct knote *kn;

	while ((kn = SLIST_FIRST(list)) != NULL) {
		kn->kn_fop->f_detach(kn);
		knote_drop(kn, p, p->p_fd);
	}
}

/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct filedesc *fdp;
	struct klist *list;

	fdp = p->p_fd;
	list = &fdp->fd_knlist[fd];
	knote_remove(p, list);
}

/*
 * Attach a new knote to a file descriptor
 */
static void
knote_attach(struct knote *kn, struct filedesc *fdp)
{
	struct klist *list;
	int size;

	if (!kn->kn_fop->f_isfd) {
		/* if knote is not on an fd, store on internal hash table */
		if (fdp->fd_knhashmask == 0)
			fdp->fd_knhash = hashinit(KN_HASHSIZE, HASH_LIST,
			    M_KEVENT, M_WAITOK, &fdp->fd_knhashmask);
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		goto done;
	}

	/*
	 * otherwise, knote is on an fd.
	 * knotes are stored in fd_knlist indexed by kn->kn_id.
	 */
	if (fdp->fd_knlistsize <= kn->kn_id) {
		/* expand list, it's too small */
		size = fdp->fd_knlistsize;
		while (size <= kn->kn_id) {
			/* grow in KQ_EXTENT chunks */
			size += KQ_EXTENT;
		}
		list = malloc(size * sizeof(struct klist *), M_KEVENT,
		    M_WAITOK);
		if (fdp->fd_knlist) {
			/* copy existing knlist */
			memcpy((caddr_t)list, (caddr_t)fdp->fd_knlist,
			    fdp->fd_knlistsize * sizeof(struct klist *));
		}
		/*
		 * Zero new memory. Stylistically, SLIST_INIT() should be
		 * used here, but that does the same thing as the memset()
		 * anyway.
		 */
		memset(&list[fdp->fd_knlistsize], 0,
		    (size - fdp->fd_knlistsize) * sizeof(struct klist *));

		/* switch to new knlist */
		if (fdp->fd_knlist != NULL)
			free(fdp->fd_knlist, M_KEVENT);
		fdp->fd_knlistsize = size;
		fdp->fd_knlist = list;
	}

	/* get list head for this fd */
	list = &fdp->fd_knlist[kn->kn_id];
 done:
	/* add new knote */
	SLIST_INSERT_HEAD(list, kn, kn_link);
	kn->kn_status = 0;
}

/*
 * Drop knote.
 * Should be called at spl == 0, since we don't want to hold spl
 * while calling FILE_UNUSE and free.
 */
static void
knote_drop(struct knote *kn, struct proc *p, struct filedesc *fdp)
{
	struct klist *list;

	if (kn->kn_fop->f_isfd)
		list = &fdp->fd_knlist[kn->kn_id];
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	if (kn->kn_fop->f_isfd)
		FILE_UNUSE(kn->kn_fp, p);
	pool_put(&knote_pool, kn);
}


/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq;
	int s;

	kq = kn->kn_kq;
	s = splhigh();
	KASSERT((kn->kn_status & KN_QUEUED) == 0);

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	splx(s);
	kqueue_wakeup(kq);
}

/*
 * Dequeue event for knote.
 */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq;
	int s;

	kq = kn->kn_kq;
	s = splhigh();
	KASSERT(kn->kn_status & KN_QUEUED);

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
	splx(s);
}