/*	$NetBSD: sys_generic.c,v 1.92.2.2 2006/10/21 15:20:47 ad Exp $	*/
2
3 /*
4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
37 */
38
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.92.2.2 2006/10/21 15:20:47 ad Exp $");
41
42 #include "opt_ktrace.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/filedesc.h>
47 #include <sys/ioctl.h>
48 #include <sys/file.h>
49 #include <sys/proc.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/stat.h>
55 #include <sys/malloc.h>
56 #include <sys/poll.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60
61 #include <sys/mount.h>
62 #include <sys/sa.h>
63 #include <sys/syscallargs.h>
64
65 #include <uvm/uvm_extern.h>
66
67 int selscan(struct lwp *, fd_mask *, fd_mask *, int, register_t *);
68 int pollscan(struct lwp *, struct pollfd *, int, register_t *);
69
70
71 /*
72 * Read system call.
73 */
74 /* ARGSUSED */
75 int
76 sys_read(struct lwp *l, void *v, register_t *retval)
77 {
78 struct sys_read_args /* {
79 syscallarg(int) fd;
80 syscallarg(void *) buf;
81 syscallarg(size_t) nbyte;
82 } */ *uap = v;
83 int fd;
84 struct file *fp;
85 struct proc *p;
86 struct filedesc *fdp;
87
88 fd = SCARG(uap, fd);
89 p = l->l_proc;
90 fdp = p->p_fd;
91
92 if ((fp = fd_getfile(fdp, fd)) == NULL)
93 return (EBADF);
94
95 if ((fp->f_flag & FREAD) == 0) {
96 simple_unlock(&fp->f_slock);
97 return (EBADF);
98 }
99
100 FILE_USE(fp);
101
102 /* dofileread() will unuse the descriptor for us */
103 return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
104 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
105 }
106
107 int
108 dofileread(struct lwp *l, int fd, struct file *fp, void *buf, size_t nbyte,
109 off_t *offset, int flags, register_t *retval)
110 {
111 struct iovec aiov;
112 struct uio auio;
113 struct proc *p;
114 struct vmspace *vm;
115 size_t cnt;
116 int error;
117 #ifdef KTRACE
118 struct iovec ktriov = { .iov_base = NULL, };
119 #endif
120 p = l->l_proc;
121
122 error = proc_vmspace_getref(p, &vm);
123 if (error) {
124 goto out;
125 }
126
127 aiov.iov_base = (caddr_t)buf;
128 aiov.iov_len = nbyte;
129 auio.uio_iov = &aiov;
130 auio.uio_iovcnt = 1;
131 auio.uio_resid = nbyte;
132 auio.uio_rw = UIO_READ;
133 auio.uio_vmspace = vm;
134
135 /*
136 * Reads return ssize_t because -1 is returned on error. Therefore
137 * we must restrict the length to SSIZE_MAX to avoid garbage return
138 * values.
139 */
140 if (auio.uio_resid > SSIZE_MAX) {
141 error = EINVAL;
142 goto out;
143 }
144
145 #ifdef KTRACE
146 /*
147 * if tracing, save a copy of iovec
148 */
149 if (KTRPOINT(p, KTR_GENIO))
150 ktriov = aiov;
151 #endif
152 cnt = auio.uio_resid;
153 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
154 if (error)
155 if (auio.uio_resid != cnt && (error == ERESTART ||
156 error == EINTR || error == EWOULDBLOCK))
157 error = 0;
158 cnt -= auio.uio_resid;
159 #ifdef KTRACE
160 if (KTRPOINT(p, KTR_GENIO) && error == 0)
161 ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
162 #endif
163 *retval = cnt;
164 out:
165 FILE_UNUSE(fp, l);
166 uvmspace_free(vm);
167 return (error);
168 }
169
170 /*
171 * Scatter read system call.
172 */
173 int
174 sys_readv(struct lwp *l, void *v, register_t *retval)
175 {
176 struct sys_readv_args /* {
177 syscallarg(int) fd;
178 syscallarg(const struct iovec *) iovp;
179 syscallarg(int) iovcnt;
180 } */ *uap = v;
181 struct filedesc *fdp;
182 struct file *fp;
183 struct proc *p;
184 int fd;
185
186 fd = SCARG(uap, fd);
187 p = l->l_proc;
188 fdp = p->p_fd;
189
190 if ((fp = fd_getfile(fdp, fd)) == NULL)
191 return (EBADF);
192
193 if ((fp->f_flag & FREAD) == 0) {
194 simple_unlock(&fp->f_slock);
195 return (EBADF);
196 }
197
198 FILE_USE(fp);
199
200 /* dofilereadv() will unuse the descriptor for us */
201 return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
202 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
203 }
204
205 int
206 dofilereadv(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
207 int iovcnt, off_t *offset, int flags, register_t *retval)
208 {
209 struct proc *p;
210 struct uio auio;
211 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
212 struct vmspace *vm;
213 int i, error;
214 size_t cnt;
215 u_int iovlen;
216 #ifdef KTRACE
217 struct iovec *ktriov;
218 #endif
219
220 p = l->l_proc;
221 error = proc_vmspace_getref(p, &vm);
222 if (error) {
223 goto out;
224 }
225
226 #ifdef KTRACE
227 ktriov = NULL;
228 #endif
229 /* note: can't use iovlen until iovcnt is validated */
230 iovlen = iovcnt * sizeof(struct iovec);
231 if ((u_int)iovcnt > UIO_SMALLIOV) {
232 if ((u_int)iovcnt > IOV_MAX) {
233 error = EINVAL;
234 goto out;
235 }
236 iov = malloc(iovlen, M_IOV, M_WAITOK);
237 needfree = iov;
238 } else if ((u_int)iovcnt > 0) {
239 iov = aiov;
240 needfree = NULL;
241 } else {
242 error = EINVAL;
243 goto out;
244 }
245
246 auio.uio_iov = iov;
247 auio.uio_iovcnt = iovcnt;
248 auio.uio_rw = UIO_READ;
249 auio.uio_vmspace = vm;
250 error = copyin(iovp, iov, iovlen);
251 if (error)
252 goto done;
253 auio.uio_resid = 0;
254 for (i = 0; i < iovcnt; i++) {
255 auio.uio_resid += iov->iov_len;
256 /*
257 * Reads return ssize_t because -1 is returned on error.
258 * Therefore we must restrict the length to SSIZE_MAX to
259 * avoid garbage return values.
260 */
261 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
262 error = EINVAL;
263 goto done;
264 }
265 iov++;
266 }
267 #ifdef KTRACE
268 /*
269 * if tracing, save a copy of iovec
270 */
271 if (KTRPOINT(p, KTR_GENIO)) {
272 ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
273 memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
274 }
275 #endif
276 cnt = auio.uio_resid;
277 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
278 if (error)
279 if (auio.uio_resid != cnt && (error == ERESTART ||
280 error == EINTR || error == EWOULDBLOCK))
281 error = 0;
282 cnt -= auio.uio_resid;
283 #ifdef KTRACE
284 if (ktriov != NULL) {
285 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
286 ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
287 free(ktriov, M_TEMP);
288 }
289 #endif
290 *retval = cnt;
291 done:
292 if (needfree)
293 free(needfree, M_IOV);
294 out:
295 FILE_UNUSE(fp, l);
296 uvmspace_free(vm);
297 return (error);
298 }
299
300 /*
301 * Write system call
302 */
303 int
304 sys_write(struct lwp *l, void *v, register_t *retval)
305 {
306 struct sys_write_args /* {
307 syscallarg(int) fd;
308 syscallarg(const void *) buf;
309 syscallarg(size_t) nbyte;
310 } */ *uap = v;
311 int fd;
312 struct file *fp;
313 struct proc *p;
314 struct filedesc *fdp;
315
316 fd = SCARG(uap, fd);
317 p = l->l_proc;
318 fdp = p->p_fd;
319
320 if ((fp = fd_getfile(fdp, fd)) == NULL)
321 return (EBADF);
322
323 if ((fp->f_flag & FWRITE) == 0) {
324 simple_unlock(&fp->f_slock);
325 return (EBADF);
326 }
327
328 FILE_USE(fp);
329
330 /* dofilewrite() will unuse the descriptor for us */
331 return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
332 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
333 }
334
335 int
336 dofilewrite(struct lwp *l, int fd, struct file *fp, const void *buf,
337 size_t nbyte, off_t *offset, int flags, register_t *retval)
338 {
339 struct iovec aiov;
340 struct uio auio;
341 struct proc *p;
342 struct vmspace *vm;
343 size_t cnt;
344 int error;
345 #ifdef KTRACE
346 struct iovec ktriov = { .iov_base = NULL, };
347 #endif
348
349 p = l->l_proc;
350 error = proc_vmspace_getref(p, &vm);
351 if (error) {
352 goto out;
353 }
354 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
355 aiov.iov_len = nbyte;
356 auio.uio_iov = &aiov;
357 auio.uio_iovcnt = 1;
358 auio.uio_resid = nbyte;
359 auio.uio_rw = UIO_WRITE;
360 auio.uio_vmspace = vm;
361
362 /*
363 * Writes return ssize_t because -1 is returned on error. Therefore
364 * we must restrict the length to SSIZE_MAX to avoid garbage return
365 * values.
366 */
367 if (auio.uio_resid > SSIZE_MAX) {
368 error = EINVAL;
369 goto out;
370 }
371
372 #ifdef KTRACE
373 /*
374 * if tracing, save a copy of iovec
375 */
376 if (KTRPOINT(p, KTR_GENIO))
377 ktriov = aiov;
378 #endif
379 cnt = auio.uio_resid;
380 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
381 if (error) {
382 if (auio.uio_resid != cnt && (error == ERESTART ||
383 error == EINTR || error == EWOULDBLOCK))
384 error = 0;
385 if (error == EPIPE) {
386 rw_enter(&proclist_lock, RW_READER);
387 psignal(p, SIGPIPE);
388 rw_exit(&proclist_lock);
389 }
390 }
391 cnt -= auio.uio_resid;
392 #ifdef KTRACE
393 if (KTRPOINT(p, KTR_GENIO) && error == 0)
394 ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
395 #endif
396 *retval = cnt;
397 out:
398 FILE_UNUSE(fp, l);
399 uvmspace_free(vm);
400 return (error);
401 }
402
403 /*
404 * Gather write system call
405 */
406 int
407 sys_writev(struct lwp *l, void *v, register_t *retval)
408 {
409 struct sys_writev_args /* {
410 syscallarg(int) fd;
411 syscallarg(const struct iovec *) iovp;
412 syscallarg(int) iovcnt;
413 } */ *uap = v;
414 int fd;
415 struct file *fp;
416 struct proc *p;
417 struct filedesc *fdp;
418
419 fd = SCARG(uap, fd);
420 p = l->l_proc;
421 fdp = p->p_fd;
422
423 if ((fp = fd_getfile(fdp, fd)) == NULL)
424 return (EBADF);
425
426 if ((fp->f_flag & FWRITE) == 0) {
427 simple_unlock(&fp->f_slock);
428 return (EBADF);
429 }
430
431 FILE_USE(fp);
432
433 /* dofilewritev() will unuse the descriptor for us */
434 return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
435 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
436 }
437
438 int
439 dofilewritev(struct lwp *l, int fd, struct file *fp, const struct iovec *iovp,
440 int iovcnt, off_t *offset, int flags, register_t *retval)
441 {
442 struct proc *p;
443 struct uio auio;
444 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
445 struct vmspace *vm;
446 int i, error;
447 size_t cnt;
448 u_int iovlen;
449 #ifdef KTRACE
450 struct iovec *ktriov;
451 #endif
452
453 p = l->l_proc;
454 error = proc_vmspace_getref(p, &vm);
455 if (error) {
456 goto out;
457 }
458 #ifdef KTRACE
459 ktriov = NULL;
460 #endif
461 /* note: can't use iovlen until iovcnt is validated */
462 iovlen = iovcnt * sizeof(struct iovec);
463 if ((u_int)iovcnt > UIO_SMALLIOV) {
464 if ((u_int)iovcnt > IOV_MAX) {
465 error = EINVAL;
466 goto out;
467 }
468 iov = malloc(iovlen, M_IOV, M_WAITOK);
469 needfree = iov;
470 } else if ((u_int)iovcnt > 0) {
471 iov = aiov;
472 needfree = NULL;
473 } else {
474 error = EINVAL;
475 goto out;
476 }
477
478 auio.uio_iov = iov;
479 auio.uio_iovcnt = iovcnt;
480 auio.uio_rw = UIO_WRITE;
481 auio.uio_vmspace = vm;
482 error = copyin(iovp, iov, iovlen);
483 if (error)
484 goto done;
485 auio.uio_resid = 0;
486 for (i = 0; i < iovcnt; i++) {
487 auio.uio_resid += iov->iov_len;
488 /*
489 * Writes return ssize_t because -1 is returned on error.
490 * Therefore we must restrict the length to SSIZE_MAX to
491 * avoid garbage return values.
492 */
493 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
494 error = EINVAL;
495 goto done;
496 }
497 iov++;
498 }
499 #ifdef KTRACE
500 /*
501 * if tracing, save a copy of iovec
502 */
503 if (KTRPOINT(p, KTR_GENIO)) {
504 ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
505 memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
506 }
507 #endif
508 cnt = auio.uio_resid;
509 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
510 if (error) {
511 if (auio.uio_resid != cnt && (error == ERESTART ||
512 error == EINTR || error == EWOULDBLOCK))
513 error = 0;
514 if (error == EPIPE) {
515 rw_enter(&proclist_lock, RW_READER);
516 psignal(p, SIGPIPE);
517 rw_exit(&proclist_lock);
518 }
519 }
520 cnt -= auio.uio_resid;
521 #ifdef KTRACE
522 if (ktriov != NULL) {
523 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
524 ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
525 free(ktriov, M_TEMP);
526 }
527 #endif
528 *retval = cnt;
529 done:
530 if (needfree)
531 free(needfree, M_IOV);
532 out:
533 FILE_UNUSE(fp, l);
534 uvmspace_free(vm);
535 return (error);
536 }
537
538 /*
539 * Ioctl system call
540 */
541 /* ARGSUSED */
/*
 * Ioctl system call: look up the descriptor, copy argument data
 * in/out of the kernel as directed by the command's IOC_IN/IOC_OUT/
 * IOC_VOID encoding, and dispatch to the file's fo_ioctl method.
 * FIONCLEX, FIOCLEX, FIONBIO and FIOASYNC are handled generically
 * here instead of by the per-file method.
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap = v;
	struct file *fp;
	struct proc *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;	/* memp != NULL iff the buffer was malloc'd */
#define STK_PARAMS 128
	/* Small argument buffers live on the stack to avoid malloc. */
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	/* Descriptor must be open for either reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the "out:" diagnostics well-defined */
		goto out;
	}

	/* Close-on-exec flags are per-descriptor, not per-file: do here. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			/* Command supplies input data: copy it in. */
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				goto out;
			}
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
					size, 0);
			}
#endif
		} else
			/* Zero-length IOC_IN: pass the user pointer itself. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then let the file object know too. */
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
					size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/* A driver returned -1 instead of EPASSTHROUGH: complain. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
			"pid=%d comm=%s\n",
			(com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
			(char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
			p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
685
/*
 * selwait: the common wait channel that select()/poll() sleepers
 * block on.  nselcoll: collision counter, bumped by selwakeup() when
 * multiple selectors raced on one selinfo; selcommon()/pollcommon()
 * compare it across the sleep to decide whether to rescan.
 */
int selwait, nselcoll;
687
688 /*
689 * Select system call.
690 */
691 int
692 sys_pselect(struct lwp *l, void *v, register_t *retval)
693 {
694 struct sys_pselect_args /* {
695 syscallarg(int) nd;
696 syscallarg(fd_set *) in;
697 syscallarg(fd_set *) ou;
698 syscallarg(fd_set *) ex;
699 syscallarg(const struct timespec *) ts;
700 syscallarg(sigset_t *) mask;
701 } */ * const uap = v;
702 struct timespec ats;
703 struct timeval atv, *tv = NULL;
704 sigset_t amask, *mask = NULL;
705 int error;
706
707 if (SCARG(uap, ts)) {
708 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
709 if (error)
710 return error;
711 atv.tv_sec = ats.tv_sec;
712 atv.tv_usec = ats.tv_nsec / 1000;
713 tv = &atv;
714 }
715 if (SCARG(uap, mask) != NULL) {
716 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
717 if (error)
718 return error;
719 mask = &amask;
720 }
721
722 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
723 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
724 }
725
/*
 * Validate a select/poll timeout and record the monotonic start time
 * in *sleeptv for later use by gettimeleft().  Returns -1 if the
 * timeout is malformed, 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
734
735 int
736 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
737 {
738 /*
739 * We have to recalculate the timeout on every retry.
740 */
741 struct timeval slepttv;
742 /*
743 * reduce tv by elapsed time
744 * based on monotonic time scale
745 */
746 getmicrouptime(&slepttv);
747 timeradd(tv, sleeptv, tv);
748 timersub(tv, &slepttv, tv);
749 *sleeptv = slepttv;
750 return tvtohz(tv);
751 }
752
753 int
754 sys_select(struct lwp *l, void *v, register_t *retval)
755 {
756 struct sys_select_args /* {
757 syscallarg(int) nd;
758 syscallarg(fd_set *) in;
759 syscallarg(fd_set *) ou;
760 syscallarg(fd_set *) ex;
761 syscallarg(struct timeval *) tv;
762 } */ * const uap = v;
763 struct timeval atv, *tv = NULL;
764 int error;
765
766 if (SCARG(uap, tv)) {
767 error = copyin(SCARG(uap, tv), (caddr_t)&atv,
768 sizeof(atv));
769 if (error)
770 return error;
771 tv = &atv;
772 }
773
774 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
775 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
776 }
777
/*
 * Common code for select()/pselect(): copy in up to three fd_set's,
 * scan them repeatedly — sleeping on selwait between scans — until a
 * descriptor is ready, the timeout expires, or a signal arrives, then
 * copy the result sets back out.
 *
 * "bits" holds six consecutive arrays of ni bytes each: the input
 * in/ou/ex sets at slots 0-2 and the output sets at slots 3-5.
 */
int
selcommon(struct lwp *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	struct proc * const p = l->l_proc;
	caddr_t bits;
	int s, ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	/* Fall back to malloc only when the sets outgrow the stack buffer. */
	if (ni * 6 > sizeof(smallbits))
		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
	else
		bits = smallbits;

	/* Copy in each supplied set; a NULL user pointer means "empty". */
#define	getbits(name, x) \
	if (u_ ## name) { \
		error = copyin(u_ ## name, bits + ni * x, ni); \
		if (error) \
			goto done; \
	} else \
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask)
		(void)sigprocmask1(l, SIG_SETMASK, mask, &oldmask);

 retry:
	/*
	 * Snapshot the collision counter and mark ourselves as selecting
	 * before scanning, so a wakeup between scan and sleep is detected.
	 */
	ncoll = nselcoll;
	l->l_flag |= L_SELECT;
	error = selscan(l, (fd_mask *)(bits + ni * 0),
	    (fd_mask *)(bits + ni * 3), nd, retval);
	if (error || *retval)
		goto done;
	if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
		goto done;
	s = splsched();
	/* If an event or collision arrived since the scan, rescan. */
	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	l->l_flag &= ~L_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
 done:
	if (mask)
		(void)sigprocmask1(l, SIG_SETMASK, &oldmask, NULL);
	l->l_flag &= ~L_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {

		/* Copy each output set (slots 3-5) back to userland. */
#define	putbits(name, x) \
		if (u_ ## name) { \
			error = copyout(bits + ni * x, u_ ## name, ni); \
			if (error) \
				goto out; \
		}
		putbits(in, 3);
		putbits(ou, 4);
		putbits(ex, 5);
#undef putbits
	}
 out:
	if (ni * 6 > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
871
/*
 * Scan the three input bitmaps (read/write/except, laid out back to
 * back at ibitp) and set the corresponding bits in the output maps at
 * obitp for every ready descriptor.  The count of ready descriptors
 * is returned via *retval.  Returns EBADF if a set bit names a
 * descriptor that is no longer open.
 */
int
selscan(struct lwp *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	/* Poll events corresponding to the in/ou/ex sets, in that order. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	struct proc *p = l->l_proc;
	struct filedesc *fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Walk the set bits via ffs(); j is the bit index. */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				/* NOTE(review): 1 << j assumes fd_mask fits
				 * in int — confirm for 64-bit fd_mask. */
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
908
909 /*
910 * Poll system call.
911 */
912 int
913 sys_poll(struct lwp *l, void *v, register_t *retval)
914 {
915 struct sys_poll_args /* {
916 syscallarg(struct pollfd *) fds;
917 syscallarg(u_int) nfds;
918 syscallarg(int) timeout;
919 } */ * const uap = v;
920 struct timeval atv, *tv = NULL;
921
922 if (SCARG(uap, timeout) != INFTIM) {
923 atv.tv_sec = SCARG(uap, timeout) / 1000;
924 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
925 tv = &atv;
926 }
927
928 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
929 tv, NULL);
930 }
931
932 /*
933 * Poll system call.
934 */
935 int
936 sys_pollts(struct lwp *l, void *v, register_t *retval)
937 {
938 struct sys_pollts_args /* {
939 syscallarg(struct pollfd *) fds;
940 syscallarg(u_int) nfds;
941 syscallarg(const struct timespec *) ts;
942 syscallarg(const sigset_t *) mask;
943 } */ * const uap = v;
944 struct timespec ats;
945 struct timeval atv, *tv = NULL;
946 sigset_t amask, *mask = NULL;
947 int error;
948
949 if (SCARG(uap, ts)) {
950 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
951 if (error)
952 return error;
953 atv.tv_sec = ats.tv_sec;
954 atv.tv_usec = ats.tv_nsec / 1000;
955 tv = &atv;
956 }
957 if (SCARG(uap, mask)) {
958 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
959 if (error)
960 return error;
961 mask = &amask;
962 }
963
964 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
965 tv, mask);
966 }
967
/*
 * Common code for poll()/pollts(): copy in the pollfd array, scan it
 * repeatedly — sleeping on selwait between scans — until a descriptor
 * has events, the timeout expires, or a signal arrives, then copy the
 * array (with revents filled in) back out.
 */
int
pollcommon(struct lwp *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	/* Stack buffer covers the common case of up to 32 descriptors. */
	char smallbits[32 * sizeof(struct pollfd)];
	struct proc * const p = l->l_proc;
	caddr_t bits;
	sigset_t oldmask;
	int s, ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask != NULL)
		(void)sigprocmask1(l, SIG_SETMASK, mask, &oldmask);

 retry:
	/*
	 * Snapshot the collision counter and mark ourselves as selecting
	 * before scanning, so a wakeup between scan and sleep is detected.
	 */
	ncoll = nselcoll;
	l->l_flag |= L_SELECT;
	error = pollscan(l, (struct pollfd *)bits, nfds, retval);
	if (error || *retval)
		goto done;
	if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
		goto done;
	s = splsched();
	/* If an event or collision arrived since the scan, rescan. */
	if ((l->l_flag & L_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	l->l_flag &= ~L_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;
 done:
	if (mask != NULL)
		(void)sigprocmask1(l, SIG_SETMASK, &oldmask, NULL);
	l->l_flag &= ~L_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Hand the revents back to userland. */
		error = copyout(bits, u_fds, ni);
		if (error)
			goto out;
	}
 out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
1041
/*
 * Scan an array of pollfd entries once, filling in each revents field
 * via the file's fo_poll method.  Invalid descriptors get POLLNVAL;
 * negative fds are ignored per poll(2) convention.  The number of
 * entries with nonzero revents is returned via *retval.
 */
int
pollscan(struct lwp *l, struct pollfd *fds, int nfd, register_t *retval)
{
	struct proc *p = l->l_proc;
	struct filedesc *fdp;
	int i, n;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out-of-range descriptor: report, don't fail. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd means "skip this entry". */
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				/* POLLERR/POLLHUP are always reportable. */
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}
1075
1076 /*ARGSUSED*/
1077 int
1078 seltrue(dev_t dev, int events, struct lwp *l)
1079 {
1080
1081 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1082 }
1083
1084 /*
1085 * Record a select request.
1086 */
/*
 * Record a select request: remember the selecting process's pid in
 * the selinfo.  If another process already has a pending select on
 * this selinfo (one of its LWPs is asleep on selwait), mark a
 * collision instead, so selwakeup() wakes everyone.
 */
void
selrecord(struct lwp *selector, struct selinfo *sip)
{
	struct lwp *l;
	struct proc *p;
	pid_t mypid;

	mypid = selector->l_proc->p_pid;
	/* Already recorded for us: nothing to do. */
	if (sip->sel_pid == mypid)
		return;
	if (sip->sel_pid && (p = p_find(sip->sel_pid, PFIND_UNLOCK_FAIL))) {
		/* p_smutex protects the LWP list while we probe it. */
		mutex_enter(&p->p_smutex);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			lwp_lock(l);
			if (l->l_wchan == (caddr_t)&selwait &&
			    l->l_stat == LSSLEEP) {
				/* Someone else is waiting: flag a collision. */
				sip->sel_collision = 1;
				lwp_unlock(l);
				mutex_exit(&p->p_smutex);
				return;
			}
			lwp_unlock(l);
		}
		mutex_exit(&p->p_smutex);
	}

	sip->sel_pid = mypid;
}
1115
1116 /*
1117 * Do a wakeup when a selectable event occurs.
1118 */
1119 void
1120 selwakeup(sip)
1121 struct selinfo *sip;
1122 {
1123 struct lwp *l;
1124 struct proc *p;
1125
1126 if (sip->sel_pid == 0)
1127 return;
1128 if (sip->sel_collision) {
1129 sip->sel_pid = 0;
1130 nselcoll++;
1131 sip->sel_collision = 0;
1132 wakeup((caddr_t)&selwait);
1133 return;
1134 }
1135
1136 /*
1137 * We must use the proclist_mutex as we can be called from an
1138 * interrupt context.
1139 */
1140 mutex_enter(&proclist_mutex);
1141 p = p_find(sip->sel_pid, PFIND_LOCKED);
1142 sip->sel_pid = 0;
1143 if (p == NULL) {
1144 mutex_exit(&proclist_mutex);
1145 return;
1146 }
1147
1148 mutex_enter(&p->p_smutex);
1149 mutex_exit_linked(&proclist_mutex, &p->p_smutex);
1150 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1151 lwp_lock(l);
1152 if (l->l_wchan == (wchan_t)&selwait && l->l_stat == LSSLEEP) {
1153 /* setrunnable() will release the lock. */
1154 setrunnable(l);
1155 } else {
1156 if (l->l_flag & L_SELECT)
1157 l->l_flag &= ~L_SELECT;
1158 lwp_unlock(l);
1159 }
1160 }
1161 mutex_exit(&p->p_smutex);
1162 }
1163