/*	$NetBSD: sys_generic.c,v 1.104 2007/08/15 12:07:34 ad Exp $	*/
2
3 /*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1989, 1993
41 * The Regents of the University of California. All rights reserved.
42 * (c) UNIX System Laboratories, Inc.
43 * All or some portions of this file are derived from material licensed
44 * to the University of California by American Telephone and Telegraph
45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46 * the permission of UNIX System Laboratories, Inc.
47 *
48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions
50 * are met:
51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution.
56 * 3. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
73 */
74
75 /*
76 * System calls relating to files.
77 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.104 2007/08/15 12:07:34 ad Exp $");
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99
100 #include <uvm/uvm_extern.h>
101
102 /* Flags for lwp::l_selflag. */
103 #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
104 #define SEL_SCANNING 1 /* polling descriptors */
105 #define SEL_BLOCKING 2 /* about to block on select_cv */
106
107 static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
108 static int pollscan(lwp_t *, struct pollfd *, int, register_t *);
109 static void selclear(void);
110
111 /* Global state for select()/poll(). */
112 kmutex_t select_lock;
113 kcondvar_t select_cv;
114 int nselcoll;
115
116 /*
117 * Read system call.
118 */
119 /* ARGSUSED */
120 int
121 sys_read(lwp_t *l, void *v, register_t *retval)
122 {
123 struct sys_read_args /* {
124 syscallarg(int) fd;
125 syscallarg(void *) buf;
126 syscallarg(size_t) nbyte;
127 } */ *uap = v;
128 int fd;
129 struct file *fp;
130 proc_t *p;
131 struct filedesc *fdp;
132
133 fd = SCARG(uap, fd);
134 p = l->l_proc;
135 fdp = p->p_fd;
136
137 if ((fp = fd_getfile(fdp, fd)) == NULL)
138 return (EBADF);
139
140 if ((fp->f_flag & FREAD) == 0) {
141 simple_unlock(&fp->f_slock);
142 return (EBADF);
143 }
144
145 FILE_USE(fp);
146
147 /* dofileread() will unuse the descriptor for us */
148 return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
149 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
150 }
151
152 int
153 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
154 off_t *offset, int flags, register_t *retval)
155 {
156 struct iovec aiov;
157 struct uio auio;
158 proc_t *p;
159 struct vmspace *vm;
160 size_t cnt;
161 int error;
162 p = l->l_proc;
163
164 error = proc_vmspace_getref(p, &vm);
165 if (error) {
166 goto out;
167 }
168
169 aiov.iov_base = (void *)buf;
170 aiov.iov_len = nbyte;
171 auio.uio_iov = &aiov;
172 auio.uio_iovcnt = 1;
173 auio.uio_resid = nbyte;
174 auio.uio_rw = UIO_READ;
175 auio.uio_vmspace = vm;
176
177 /*
178 * Reads return ssize_t because -1 is returned on error. Therefore
179 * we must restrict the length to SSIZE_MAX to avoid garbage return
180 * values.
181 */
182 if (auio.uio_resid > SSIZE_MAX) {
183 error = EINVAL;
184 goto out;
185 }
186
187 cnt = auio.uio_resid;
188 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
189 if (error)
190 if (auio.uio_resid != cnt && (error == ERESTART ||
191 error == EINTR || error == EWOULDBLOCK))
192 error = 0;
193 cnt -= auio.uio_resid;
194 ktrgenio(fd, UIO_READ, buf, nbyte, error);
195 *retval = cnt;
196 out:
197 FILE_UNUSE(fp, l);
198 uvmspace_free(vm);
199 return (error);
200 }
201
202 /*
203 * Scatter read system call.
204 */
205 int
206 sys_readv(lwp_t *l, void *v, register_t *retval)
207 {
208 struct sys_readv_args /* {
209 syscallarg(int) fd;
210 syscallarg(const struct iovec *) iovp;
211 syscallarg(int) iovcnt;
212 } */ *uap = v;
213
214 return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
215 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
216 }
217
218 int
219 do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
220 off_t *offset, int flags, register_t *retval)
221 {
222 struct proc *p;
223 struct uio auio;
224 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
225 struct vmspace *vm;
226 int i, error;
227 size_t cnt;
228 u_int iovlen;
229 struct file *fp;
230 struct filedesc *fdp;
231 struct iovec *ktriov = NULL;
232
233 if (iovcnt == 0)
234 return EINVAL;
235
236 p = l->l_proc;
237 fdp = p->p_fd;
238
239 if ((fp = fd_getfile(fdp, fd)) == NULL)
240 return EBADF;
241
242 if ((fp->f_flag & FREAD) == 0) {
243 simple_unlock(&fp->f_slock);
244 return EBADF;
245 }
246
247 FILE_USE(fp);
248
249 if (offset == NULL)
250 offset = &fp->f_offset;
251 else {
252 struct vnode *vp = fp->f_data;
253 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
254 error = ESPIPE;
255 goto out;
256 }
257 /*
258 * Test that the device is seekable ?
259 * XXX This works because no file systems actually
260 * XXX take any action on the seek operation.
261 */
262 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
263 if (error != 0)
264 goto out;
265 }
266
267 error = proc_vmspace_getref(p, &vm);
268 if (error)
269 goto out;
270
271 iovlen = iovcnt * sizeof(struct iovec);
272 if (flags & FOF_IOV_SYSSPACE)
273 iov = __UNCONST(iovp);
274 else {
275 iov = aiov;
276 if ((u_int)iovcnt > UIO_SMALLIOV) {
277 if ((u_int)iovcnt > IOV_MAX) {
278 error = EINVAL;
279 goto out;
280 }
281 iov = kmem_alloc(iovlen, KM_SLEEP);
282 if (iov == NULL) {
283 error = ENOMEM;
284 goto out;
285 }
286 needfree = iov;
287 }
288 error = copyin(iovp, iov, iovlen);
289 if (error)
290 goto done;
291 }
292
293 auio.uio_iov = iov;
294 auio.uio_iovcnt = iovcnt;
295 auio.uio_rw = UIO_READ;
296 auio.uio_vmspace = vm;
297
298 auio.uio_resid = 0;
299 for (i = 0; i < iovcnt; i++, iov++) {
300 auio.uio_resid += iov->iov_len;
301 /*
302 * Reads return ssize_t because -1 is returned on error.
303 * Therefore we must restrict the length to SSIZE_MAX to
304 * avoid garbage return values.
305 */
306 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
307 error = EINVAL;
308 goto done;
309 }
310 }
311
312 /*
313 * if tracing, save a copy of iovec
314 */
315 if (ktrpoint(KTR_GENIO)) {
316 ktriov = kmem_alloc(iovlen, KM_SLEEP);
317 if (ktriov != NULL)
318 memcpy(ktriov, auio.uio_iov, iovlen);
319 }
320
321 cnt = auio.uio_resid;
322 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
323 if (error)
324 if (auio.uio_resid != cnt && (error == ERESTART ||
325 error == EINTR || error == EWOULDBLOCK))
326 error = 0;
327 cnt -= auio.uio_resid;
328 *retval = cnt;
329
330 if (ktriov != NULL) {
331 ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
332 kmem_free(ktriov, iovlen);
333 }
334
335 done:
336 if (needfree)
337 kmem_free(needfree, iovlen);
338 out:
339 FILE_UNUSE(fp, l);
340 uvmspace_free(vm);
341 return (error);
342 }
343
344 /*
345 * Write system call
346 */
347 int
348 sys_write(lwp_t *l, void *v, register_t *retval)
349 {
350 struct sys_write_args /* {
351 syscallarg(int) fd;
352 syscallarg(const void *) buf;
353 syscallarg(size_t) nbyte;
354 } */ *uap = v;
355 int fd;
356 struct file *fp;
357 proc_t *p;
358 struct filedesc *fdp;
359
360 fd = SCARG(uap, fd);
361 p = l->l_proc;
362 fdp = p->p_fd;
363
364 if ((fp = fd_getfile(fdp, fd)) == NULL)
365 return (EBADF);
366
367 if ((fp->f_flag & FWRITE) == 0) {
368 simple_unlock(&fp->f_slock);
369 return (EBADF);
370 }
371
372 FILE_USE(fp);
373
374 /* dofilewrite() will unuse the descriptor for us */
375 return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
376 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
377 }
378
379 int
380 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
381 size_t nbyte, off_t *offset, int flags, register_t *retval)
382 {
383 struct iovec aiov;
384 struct uio auio;
385 proc_t *p;
386 struct vmspace *vm;
387 size_t cnt;
388 int error;
389
390 p = l->l_proc;
391 error = proc_vmspace_getref(p, &vm);
392 if (error) {
393 goto out;
394 }
395 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
396 aiov.iov_len = nbyte;
397 auio.uio_iov = &aiov;
398 auio.uio_iovcnt = 1;
399 auio.uio_resid = nbyte;
400 auio.uio_rw = UIO_WRITE;
401 auio.uio_vmspace = vm;
402
403 /*
404 * Writes return ssize_t because -1 is returned on error. Therefore
405 * we must restrict the length to SSIZE_MAX to avoid garbage return
406 * values.
407 */
408 if (auio.uio_resid > SSIZE_MAX) {
409 error = EINVAL;
410 goto out;
411 }
412
413 cnt = auio.uio_resid;
414 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
415 if (error) {
416 if (auio.uio_resid != cnt && (error == ERESTART ||
417 error == EINTR || error == EWOULDBLOCK))
418 error = 0;
419 if (error == EPIPE) {
420 mutex_enter(&proclist_mutex);
421 psignal(p, SIGPIPE);
422 mutex_exit(&proclist_mutex);
423 }
424 }
425 cnt -= auio.uio_resid;
426 ktrgenio(fd, UIO_WRITE, buf, nbyte, error);
427 *retval = cnt;
428 out:
429 FILE_UNUSE(fp, l);
430 uvmspace_free(vm);
431 return (error);
432 }
433
434 /*
435 * Gather write system call
436 */
437 int
438 sys_writev(lwp_t *l, void *v, register_t *retval)
439 {
440 struct sys_writev_args /* {
441 syscallarg(int) fd;
442 syscallarg(const struct iovec *) iovp;
443 syscallarg(int) iovcnt;
444 } */ *uap = v;
445
446 return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
447 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
448 }
449
450 int
451 do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
452 off_t *offset, int flags, register_t *retval)
453 {
454 struct proc *p;
455 struct uio auio;
456 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
457 struct vmspace *vm;
458 int i, error;
459 size_t cnt;
460 u_int iovlen;
461 struct file *fp;
462 struct filedesc *fdp;
463 struct iovec *ktriov = NULL;
464
465 if (iovcnt == 0)
466 return EINVAL;
467
468 p = l->l_proc;
469 fdp = p->p_fd;
470
471 if ((fp = fd_getfile(fdp, fd)) == NULL)
472 return EBADF;
473
474 if ((fp->f_flag & FWRITE) == 0) {
475 simple_unlock(&fp->f_slock);
476 return EBADF;
477 }
478
479 FILE_USE(fp);
480
481 if (offset == NULL)
482 offset = &fp->f_offset;
483 else {
484 struct vnode *vp = fp->f_data;
485 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
486 error = ESPIPE;
487 goto out;
488 }
489 /*
490 * Test that the device is seekable ?
491 * XXX This works because no file systems actually
492 * XXX take any action on the seek operation.
493 */
494 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
495 if (error != 0)
496 goto out;
497 }
498
499 error = proc_vmspace_getref(p, &vm);
500 if (error)
501 goto out;
502
503 iovlen = iovcnt * sizeof(struct iovec);
504 if (flags & FOF_IOV_SYSSPACE)
505 iov = __UNCONST(iovp);
506 else {
507 iov = aiov;
508 if ((u_int)iovcnt > UIO_SMALLIOV) {
509 if ((u_int)iovcnt > IOV_MAX) {
510 error = EINVAL;
511 goto out;
512 }
513 iov = kmem_alloc(iovlen, KM_SLEEP);
514 if (iov == NULL) {
515 error = ENOMEM;
516 goto out;
517 }
518 needfree = iov;
519 }
520 error = copyin(iovp, iov, iovlen);
521 if (error)
522 goto done;
523 }
524
525 auio.uio_iov = iov;
526 auio.uio_iovcnt = iovcnt;
527 auio.uio_rw = UIO_WRITE;
528 auio.uio_vmspace = vm;
529
530 auio.uio_resid = 0;
531 for (i = 0; i < iovcnt; i++, iov++) {
532 auio.uio_resid += iov->iov_len;
533 /*
534 * Writes return ssize_t because -1 is returned on error.
535 * Therefore we must restrict the length to SSIZE_MAX to
536 * avoid garbage return values.
537 */
538 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
539 error = EINVAL;
540 goto done;
541 }
542 }
543
544 /*
545 * if tracing, save a copy of iovec
546 */
547 if (ktrpoint(KTR_GENIO)) {
548 ktriov = kmem_alloc(iovlen, KM_SLEEP);
549 if (ktriov != NULL)
550 memcpy(ktriov, auio.uio_iov, iovlen);
551 }
552
553 cnt = auio.uio_resid;
554 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
555 if (error) {
556 if (auio.uio_resid != cnt && (error == ERESTART ||
557 error == EINTR || error == EWOULDBLOCK))
558 error = 0;
559 if (error == EPIPE) {
560 mutex_enter(&proclist_mutex);
561 psignal(p, SIGPIPE);
562 mutex_exit(&proclist_mutex);
563 }
564 }
565 cnt -= auio.uio_resid;
566 *retval = cnt;
567
568 if (ktriov != NULL) {
569 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
570 kmem_free(ktriov, iovlen);
571 }
572
573 done:
574 if (needfree)
575 kmem_free(needfree, iovlen);
576 out:
577 FILE_UNUSE(fp, l);
578 uvmspace_free(vm);
579 return (error);
580 }
581
582 /*
583 * Ioctl system call
584 */
585 /* ARGSUSED */
586 int
587 sys_ioctl(struct lwp *l, void *v, register_t *retval)
588 {
589 struct sys_ioctl_args /* {
590 syscallarg(int) fd;
591 syscallarg(u_long) com;
592 syscallarg(void *) data;
593 } */ *uap = v;
594 struct file *fp;
595 proc_t *p;
596 struct filedesc *fdp;
597 u_long com;
598 int error;
599 u_int size;
600 void *data, *memp;
601 #define STK_PARAMS 128
602 u_long stkbuf[STK_PARAMS/sizeof(u_long)];
603
604 error = 0;
605 p = l->l_proc;
606 fdp = p->p_fd;
607
608 if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
609 return (EBADF);
610
611 FILE_USE(fp);
612
613 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
614 error = EBADF;
615 com = 0;
616 goto out;
617 }
618
619 switch (com = SCARG(uap, com)) {
620 case FIONCLEX:
621 fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
622 goto out;
623
624 case FIOCLEX:
625 fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
626 goto out;
627 }
628
629 /*
630 * Interpret high order word to find amount of data to be
631 * copied to/from the user's address space.
632 */
633 size = IOCPARM_LEN(com);
634 if (size > IOCPARM_MAX) {
635 error = ENOTTY;
636 goto out;
637 }
638 memp = NULL;
639 if (size > sizeof(stkbuf)) {
640 memp = kmem_alloc(size, KM_SLEEP);
641 data = memp;
642 } else
643 data = (void *)stkbuf;
644 if (com&IOC_IN) {
645 if (size) {
646 error = copyin(SCARG(uap, data), data, size);
647 if (error) {
648 if (memp)
649 kmem_free(memp, size);
650 goto out;
651 }
652 ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
653 size, 0);
654 } else
655 *(void **)data = SCARG(uap, data);
656 } else if ((com&IOC_OUT) && size)
657 /*
658 * Zero the buffer so the user always
659 * gets back something deterministic.
660 */
661 memset(data, 0, size);
662 else if (com&IOC_VOID)
663 *(void **)data = SCARG(uap, data);
664
665 switch (com) {
666
667 case FIONBIO:
668 if (*(int *)data != 0)
669 fp->f_flag |= FNONBLOCK;
670 else
671 fp->f_flag &= ~FNONBLOCK;
672 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
673 break;
674
675 case FIOASYNC:
676 if (*(int *)data != 0)
677 fp->f_flag |= FASYNC;
678 else
679 fp->f_flag &= ~FASYNC;
680 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
681 break;
682
683 default:
684 error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
685 /*
686 * Copy any data to user, size was
687 * already set and checked above.
688 */
689 if (error == 0 && (com&IOC_OUT) && size) {
690 error = copyout(data, SCARG(uap, data), size);
691 ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
692 size, error);
693 }
694 break;
695 }
696 if (memp)
697 kmem_free(memp, size);
698 out:
699 FILE_UNUSE(fp, l);
700 switch (error) {
701 case -1:
702 printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
703 "pid=%d comm=%s\n",
704 (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
705 (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
706 p->p_pid, p->p_comm);
707 /* FALLTHROUGH */
708 case EPASSTHROUGH:
709 error = ENOTTY;
710 /* FALLTHROUGH */
711 default:
712 return (error);
713 }
714 }
715
716 /*
717 * Select system call.
718 */
719 int
720 sys_pselect(lwp_t *l, void *v, register_t *retval)
721 {
722 struct sys_pselect_args /* {
723 syscallarg(int) nd;
724 syscallarg(fd_set *) in;
725 syscallarg(fd_set *) ou;
726 syscallarg(fd_set *) ex;
727 syscallarg(const struct timespec *) ts;
728 syscallarg(sigset_t *) mask;
729 } */ * const uap = v;
730 struct timespec ats;
731 struct timeval atv, *tv = NULL;
732 sigset_t amask, *mask = NULL;
733 int error;
734
735 if (SCARG(uap, ts)) {
736 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
737 if (error)
738 return error;
739 atv.tv_sec = ats.tv_sec;
740 atv.tv_usec = ats.tv_nsec / 1000;
741 tv = &atv;
742 }
743 if (SCARG(uap, mask) != NULL) {
744 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
745 if (error)
746 return error;
747 mask = &amask;
748 }
749
750 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
751 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
752 }
753
754 int
755 inittimeleft(struct timeval *tv, struct timeval *sleeptv)
756 {
757 if (itimerfix(tv))
758 return -1;
759 getmicrouptime(sleeptv);
760 return 0;
761 }
762
763 int
764 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
765 {
766 /*
767 * We have to recalculate the timeout on every retry.
768 */
769 struct timeval slepttv;
770 /*
771 * reduce tv by elapsed time
772 * based on monotonic time scale
773 */
774 getmicrouptime(&slepttv);
775 timeradd(tv, sleeptv, tv);
776 timersub(tv, &slepttv, tv);
777 *sleeptv = slepttv;
778 return tvtohz(tv);
779 }
780
781 int
782 sys_select(lwp_t *l, void *v, register_t *retval)
783 {
784 struct sys_select_args /* {
785 syscallarg(int) nd;
786 syscallarg(fd_set *) in;
787 syscallarg(fd_set *) ou;
788 syscallarg(fd_set *) ex;
789 syscallarg(struct timeval *) tv;
790 } */ * const uap = v;
791 struct timeval atv, *tv = NULL;
792 int error;
793
794 if (SCARG(uap, tv)) {
795 error = copyin(SCARG(uap, tv), (void *)&atv,
796 sizeof(atv));
797 if (error)
798 return error;
799 tv = &atv;
800 }
801
802 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
803 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
804 }
805
806 int
807 selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
808 fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
809 {
810 char smallbits[howmany(FD_SETSIZE, NFDBITS) *
811 sizeof(fd_mask) * 6];
812 proc_t * const p = l->l_proc;
813 char *bits;
814 int ncoll, error, timo;
815 size_t ni;
816 sigset_t oldmask;
817 struct timeval sleeptv;
818
819 error = 0;
820 if (nd < 0)
821 return (EINVAL);
822 if (nd > p->p_fd->fd_nfiles) {
823 /* forgiving; slightly wrong */
824 nd = p->p_fd->fd_nfiles;
825 }
826 ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
827 if (ni * 6 > sizeof(smallbits))
828 bits = kmem_alloc(ni * 6, KM_SLEEP);
829 else
830 bits = smallbits;
831
832 #define getbits(name, x) \
833 if (u_ ## name) { \
834 error = copyin(u_ ## name, bits + ni * x, ni); \
835 if (error) \
836 goto done; \
837 } else \
838 memset(bits + ni * x, 0, ni);
839 getbits(in, 0);
840 getbits(ou, 1);
841 getbits(ex, 2);
842 #undef getbits
843
844 timo = 0;
845 if (tv && inittimeleft(tv, &sleeptv) == -1) {
846 error = EINVAL;
847 goto done;
848 }
849
850 if (mask) {
851 sigminusset(&sigcantmask, mask);
852 mutex_enter(&p->p_smutex);
853 oldmask = l->l_sigmask;
854 l->l_sigmask = *mask;
855 mutex_exit(&p->p_smutex);
856 } else
857 oldmask = l->l_sigmask; /* XXXgcc */
858
859 mutex_enter(&select_lock);
860 SLIST_INIT(&l->l_selwait);
861 for (;;) {
862 l->l_selflag = SEL_SCANNING;
863 ncoll = nselcoll;
864 mutex_exit(&select_lock);
865
866 error = selscan(l, (fd_mask *)(bits + ni * 0),
867 (fd_mask *)(bits + ni * 3), nd, retval);
868
869 mutex_enter(&select_lock);
870 if (error || *retval)
871 break;
872 if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
873 break;
874 if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
875 continue;
876 l->l_selflag = SEL_BLOCKING;
877 error = cv_timedwait_sig(&select_cv, &select_lock, timo);
878 if (error != 0)
879 break;
880 }
881 selclear();
882 mutex_exit(&select_lock);
883
884 if (mask) {
885 mutex_enter(&p->p_smutex);
886 l->l_sigmask = oldmask;
887 mutex_exit(&p->p_smutex);
888 }
889
890 done:
891 /* select is not restarted after signals... */
892 if (error == ERESTART)
893 error = EINTR;
894 if (error == EWOULDBLOCK)
895 error = 0;
896 if (error == 0 && u_in != NULL)
897 error = copyout(bits + ni * 3, u_in, ni);
898 if (error == 0 && u_ou != NULL)
899 error = copyout(bits + ni * 4, u_ou, ni);
900 if (error == 0 && u_ex != NULL)
901 error = copyout(bits + ni * 5, u_ex, ni);
902 if (bits != smallbits)
903 kmem_free(bits, ni * 6);
904 return (error);
905 }
906
907 int
908 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
909 register_t *retval)
910 {
911 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
912 POLLWRNORM | POLLHUP | POLLERR,
913 POLLRDBAND };
914 proc_t *p = l->l_proc;
915 struct filedesc *fdp;
916 int msk, i, j, fd, n;
917 fd_mask ibits, obits;
918 struct file *fp;
919
920 fdp = p->p_fd;
921 n = 0;
922 for (msk = 0; msk < 3; msk++) {
923 for (i = 0; i < nfd; i += NFDBITS) {
924 ibits = *ibitp++;
925 obits = 0;
926 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
927 ibits &= ~(1 << j);
928 if ((fp = fd_getfile(fdp, fd)) == NULL)
929 return (EBADF);
930 FILE_USE(fp);
931 if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
932 obits |= (1 << j);
933 n++;
934 }
935 FILE_UNUSE(fp, l);
936 }
937 *obitp++ = obits;
938 }
939 }
940 *retval = n;
941 return (0);
942 }
943
944 /*
945 * Poll system call.
946 */
947 int
948 sys_poll(lwp_t *l, void *v, register_t *retval)
949 {
950 struct sys_poll_args /* {
951 syscallarg(struct pollfd *) fds;
952 syscallarg(u_int) nfds;
953 syscallarg(int) timeout;
954 } */ * const uap = v;
955 struct timeval atv, *tv = NULL;
956
957 if (SCARG(uap, timeout) != INFTIM) {
958 atv.tv_sec = SCARG(uap, timeout) / 1000;
959 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
960 tv = &atv;
961 }
962
963 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
964 tv, NULL);
965 }
966
967 /*
968 * Poll system call.
969 */
970 int
971 sys_pollts(lwp_t *l, void *v, register_t *retval)
972 {
973 struct sys_pollts_args /* {
974 syscallarg(struct pollfd *) fds;
975 syscallarg(u_int) nfds;
976 syscallarg(const struct timespec *) ts;
977 syscallarg(const sigset_t *) mask;
978 } */ * const uap = v;
979 struct timespec ats;
980 struct timeval atv, *tv = NULL;
981 sigset_t amask, *mask = NULL;
982 int error;
983
984 if (SCARG(uap, ts)) {
985 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
986 if (error)
987 return error;
988 atv.tv_sec = ats.tv_sec;
989 atv.tv_usec = ats.tv_nsec / 1000;
990 tv = &atv;
991 }
992 if (SCARG(uap, mask)) {
993 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
994 if (error)
995 return error;
996 mask = &amask;
997 }
998
999 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1000 tv, mask);
1001 }
1002
1003 int
1004 pollcommon(lwp_t *l, register_t *retval,
1005 struct pollfd *u_fds, u_int nfds,
1006 struct timeval *tv, sigset_t *mask)
1007 {
1008 char smallbits[32 * sizeof(struct pollfd)];
1009 proc_t * const p = l->l_proc;
1010 void * bits;
1011 sigset_t oldmask;
1012 int ncoll, error, timo;
1013 size_t ni;
1014 struct timeval sleeptv;
1015
1016 if (nfds > p->p_fd->fd_nfiles) {
1017 /* forgiving; slightly wrong */
1018 nfds = p->p_fd->fd_nfiles;
1019 }
1020 ni = nfds * sizeof(struct pollfd);
1021 if (ni > sizeof(smallbits))
1022 bits = kmem_alloc(ni, KM_SLEEP);
1023 else
1024 bits = smallbits;
1025
1026 error = copyin(u_fds, bits, ni);
1027 if (error)
1028 goto done;
1029
1030 timo = 0;
1031 if (tv && inittimeleft(tv, &sleeptv) == -1) {
1032 error = EINVAL;
1033 goto done;
1034 }
1035
1036 if (mask) {
1037 sigminusset(&sigcantmask, mask);
1038 mutex_enter(&p->p_smutex);
1039 oldmask = l->l_sigmask;
1040 l->l_sigmask = *mask;
1041 mutex_exit(&p->p_smutex);
1042 } else
1043 oldmask = l->l_sigmask; /* XXXgcc */
1044
1045 mutex_enter(&select_lock);
1046 SLIST_INIT(&l->l_selwait);
1047 for (;;) {
1048 ncoll = nselcoll;
1049 l->l_selflag = SEL_SCANNING;
1050 mutex_exit(&select_lock);
1051
1052 error = pollscan(l, (struct pollfd *)bits, nfds, retval);
1053
1054 mutex_enter(&select_lock);
1055 if (error || *retval)
1056 break;
1057 if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
1058 break;
1059 if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
1060 continue;
1061 l->l_selflag = SEL_BLOCKING;
1062 error = cv_timedwait_sig(&select_cv, &select_lock, timo);
1063 if (error != 0)
1064 break;
1065 }
1066 selclear();
1067 mutex_exit(&select_lock);
1068
1069 if (mask) {
1070 mutex_enter(&p->p_smutex);
1071 l->l_sigmask = oldmask;
1072 mutex_exit(&p->p_smutex);
1073 }
1074 done:
1075 /* poll is not restarted after signals... */
1076 if (error == ERESTART)
1077 error = EINTR;
1078 if (error == EWOULDBLOCK)
1079 error = 0;
1080 if (error == 0)
1081 error = copyout(bits, u_fds, ni);
1082 if (bits != smallbits)
1083 kmem_free(bits, ni);
1084 return (error);
1085 }
1086
1087 int
1088 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1089 {
1090 proc_t *p = l->l_proc;
1091 struct filedesc *fdp;
1092 int i, n;
1093 struct file *fp;
1094
1095 fdp = p->p_fd;
1096 n = 0;
1097 for (i = 0; i < nfd; i++, fds++) {
1098 if (fds->fd >= fdp->fd_nfiles) {
1099 fds->revents = POLLNVAL;
1100 n++;
1101 } else if (fds->fd < 0) {
1102 fds->revents = 0;
1103 } else {
1104 if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1105 fds->revents = POLLNVAL;
1106 n++;
1107 } else {
1108 FILE_USE(fp);
1109 fds->revents = (*fp->f_ops->fo_poll)(fp,
1110 fds->events | POLLERR | POLLHUP, l);
1111 if (fds->revents != 0)
1112 n++;
1113 FILE_UNUSE(fp, l);
1114 }
1115 }
1116 }
1117 *retval = n;
1118 return (0);
1119 }
1120
1121 /*ARGSUSED*/
1122 int
1123 seltrue(dev_t dev, int events, lwp_t *l)
1124 {
1125
1126 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1127 }
1128
1129 /*
1130 * Record a select request.
1131 */
1132 void
1133 selrecord(lwp_t *selector, struct selinfo *sip)
1134 {
1135
1136 mutex_enter(&select_lock);
1137 if (sip->sel_lwp == NULL) {
1138 /* First named waiter, although there may be more. */
1139 sip->sel_lwp = selector;
1140 SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
1141 } else if (sip->sel_lwp != selector) {
1142 /* Multiple waiters. */
1143 sip->sel_collision = true;
1144 }
1145 mutex_exit(&select_lock);
1146 }
1147
1148 /*
1149 * Do a wakeup when a selectable event occurs.
1150 */
1151 void
1152 selwakeup(struct selinfo *sip)
1153 {
1154 lwp_t *l;
1155
1156 mutex_enter(&select_lock);
1157 if (sip->sel_collision) {
1158 /* Multiple waiters - just notify everybody. */
1159 nselcoll++;
1160 sip->sel_collision = false;
1161 cv_broadcast(&select_cv);
1162 } else if (sip->sel_lwp != NULL) {
1163 /* Only one LWP waiting. */
1164 l = sip->sel_lwp;
1165 if (l->l_selflag == SEL_BLOCKING) {
1166 /*
1167 * If it's sleeping, wake it up. If not, it's
1168 * already awake but hasn't yet removed itself
1169 * from the selector. We reset the state below
1170 * so that we only attempt to do this once.
1171 */
1172 lwp_lock(l);
1173 if (l->l_wchan == &select_cv) {
1174 /* lwp_unsleep() releases the LWP lock. */
1175 lwp_unsleep(l);
1176 } else
1177 lwp_unlock(l);
1178 } else {
1179 /*
1180 * Not yet asleep. Reset its state below so that
1181 * it will go around again.
1182 */
1183 }
1184 l->l_selflag = SEL_RESET;
1185 }
1186 mutex_exit(&select_lock);
1187 }
1188
1189 void
1190 selnotify(struct selinfo *sip, long knhint)
1191 {
1192
1193 selwakeup(sip);
1194 KNOTE(&sip->sel_klist, knhint);
1195 }
1196
1197 /*
1198 * Remove an LWP from all objects that it is waiting for.
1199 */
1200 static void
1201 selclear(void)
1202 {
1203 struct selinfo *sip;
1204 lwp_t *l = curlwp;
1205
1206 KASSERT(mutex_owned(&select_lock));
1207
1208 SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1209 KASSERT(sip->sel_lwp == l);
1210 sip->sel_lwp = NULL;
1211 }
1212 }
1213
1214 /*
1215 * Initialize the select/poll system calls.
1216 */
1217 void
1218 selsysinit(void)
1219 {
1220
1221 mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
1222 cv_init(&select_cv, "select");
1223 }
1224