/*	$NetBSD: sys_generic.c,v 1.103 2007/07/09 21:10:56 ad Exp $	*/

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls relating to files.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.103 2007/07/09 21:10:56 ad Exp $");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */
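
/*
 * Rough sketch of the l_selflag state machine, as implemented by
 * selcommon()/pollcommon() and selwakeup() below:
 *
 *	SEL_RESET     --(scan starts)-------> SEL_SCANNING
 *	SEL_SCANNING  --(nothing ready)-----> SEL_BLOCKING, sleep on select_cv
 *	any state     --(selwakeup() fires)-> SEL_RESET, forcing a rescan
 *
 * If selwakeup() resets the state while a scan is in progress, the
 * scanning LWP notices (l_selflag != SEL_SCANNING) and loops to rescan
 * instead of going to sleep, so no wakeup is lost between scan and block.
 */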

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

/* Global state for select()/poll(). */
kmutex_t	select_lock;
kcondvar_t	select_cv;
int		nselcoll;

/*
 * Read system call.
 */
/* ARGSUSED */
int
sys_read(lwp_t *l, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int)		fd;
		syscallarg(void *)	buf;
		syscallarg(size_t)	nbyte;
	} */ *uap = v;
	int fd;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FREAD) == 0) {
		simple_unlock(&fp->f_slock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}

int
dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
	off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	proc_t *p;
	struct vmspace *vm = NULL;	/* NULL until we hold a reference */
	size_t cnt;
	int error;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	p = l->l_proc;

	error = proc_vmspace_getref(p, &vm);
	if (error) {
		goto out;
	}

	aiov.iov_base = (void *)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = vm;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/* In case we are tracing, save a copy of the iovec. */
	ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	if (vm != NULL)		/* may be NULL if we failed early */
		uvmspace_free(vm);
	return (error);
}
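
/*
 * Note the partial-transfer rule implemented above: if the transfer is
 * interrupted (ERESTART/EINTR/EWOULDBLOCK) after some bytes have already
 * moved, the error is discarded and the byte count is returned instead.
 * For example, a read() of 1000 bytes hit by a signal after 400 bytes
 * have been copied returns 400, not -1/EINTR; only a transfer that moved
 * no data at all reports the error to the caller.
 */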

/*
 * Scatter read system call.
 */
int
sys_readv(lwp_t *l, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int)				fd;
		syscallarg(const struct iovec *)	iovp;
		syscallarg(int)				iovcnt;
	} */ *uap = v;

	return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}

int
do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct proc *p;
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	struct vmspace *vm = NULL;	/* NULL until we hold a reference */
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct filedesc *fdp;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	if (iovcnt == 0)
		return EINVAL;

	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FREAD) == 0) {
		simple_unlock(&fp->f_slock);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable.
		 * XXX This works only because no file system actually
		 * XXX takes any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	error = proc_vmspace_getref(p, &vm);
	if (error)
		goto out;

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = vm;

	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec.
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}
#endif

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

#ifdef KTRACE
	if (ktriov != NULL) {
		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}
#endif

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	if (vm != NULL)		/* may be NULL if we failed early */
		uvmspace_free(vm);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(lwp_t *l, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int)		fd;
		syscallarg(const void *) buf;
		syscallarg(size_t)	nbyte;
	} */ *uap = v;
	int fd;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	if ((fp->f_flag & FWRITE) == 0) {
		simple_unlock(&fp->f_slock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}

int
dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	proc_t *p;
	struct vmspace *vm = NULL;	/* NULL until we hold a reference */
	size_t cnt;
	int error;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	p = l->l_proc;
	error = proc_vmspace_getref(p, &vm);
	if (error) {
		goto out;
	}
	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = vm;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/* In case we are tracing, save a copy of the iovec. */
	ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(p, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	if (vm != NULL)		/* may be NULL if we failed early */
		uvmspace_free(vm);
	return (error);
}

/*
 * Gather write system call
 */
int
sys_writev(lwp_t *l, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int)				fd;
		syscallarg(const struct iovec *)	iovp;
		syscallarg(int)				iovcnt;
	} */ *uap = v;

	return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}

int
do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct proc *p;
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	struct vmspace *vm = NULL;	/* NULL until we hold a reference */
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct filedesc *fdp;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	if (iovcnt == 0)
		return EINVAL;

	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FWRITE) == 0) {
		simple_unlock(&fp->f_slock);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable.
		 * XXX This works only because no file system actually
		 * XXX takes any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	error = proc_vmspace_getref(p, &vm);
	if (error)
		goto out;

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = vm;

	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec.
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(p, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	*retval = cnt;

#ifdef KTRACE
	if (ktriov != NULL) {
		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}
#endif

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	if (vm != NULL)		/* may be NULL if we failed early */
		uvmspace_free(vm);
	return (error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */ *uap = v;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
#define	STK_PARAMS	128
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;
		goto out;
	}

	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
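	/*
	 * For example (a sketch; the exact encoding lives in
	 * sys/ioccom.h): FIONBIO is _IOW('f', 126, int), so
	 * IOCPARM_LEN(com) yields sizeof(int) and the IOC_IN bit is
	 * set, meaning the int argument is copied in from user space
	 * before the handler is called.
	 */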
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
				    size, 0);
			}
#endif
		} else
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
				    size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}

/*
 * Select system call with signal mask and nanosecond timeout (pselect).
 */
int
sys_pselect(lwp_t *l, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */ * const uap = v;
	struct timespec ats;
	struct timeval atv, *tv = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
}

int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv))
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}

int
gettimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	/*
	 * We have to recalculate the timeout on every retry.
	 */
	struct timeval slepttv;
	/*
	 * Reduce tv by the elapsed time, measured on the
	 * monotonic time scale.
	 */
	getmicrouptime(&slepttv);
	timeradd(tv, sleeptv, tv);
	timersub(tv, &slepttv, tv);
	*sleeptv = slepttv;
	return tvtohz(tv);
}
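
/*
 * Worked example of the timeout bookkeeping above: with a 5s timeout,
 * inittimeleft() records the monotonic start time in *sleeptv.  If the
 * caller sleeps and is woken (say, by an unrelated collision) 2s later,
 * gettimeleft() adds the old *sleeptv to the remaining time, subtracts
 * the current uptime, leaving 3s, and returns that remainder converted
 * to clock ticks by tvtohz() for the next cv_timedwait_sig() call.
 */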

int
sys_select(lwp_t *l, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */ * const uap = v;
	struct timeval atv, *tv = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv,
			sizeof(atv));
		if (error)
			return error;
		tv = &atv;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
}

int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval	sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;
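
	/*
	 * The bits buffer holds six descriptor sets of ni bytes each:
	 * slots 0-2 are the caller's in/ou/ex input sets, and slots 3-5
	 * receive the corresponding output sets that selscan() fills in
	 * and that are copied back to the user on success.
	 */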

#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
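
/*
 * Bit layout used by selscan(): descriptor fd lives in word fd / NFDBITS
 * at bit fd % NFDBITS of each set.  With NFDBITS == 32, for instance,
 * descriptor 40 is bit 8 of the second word, which is why the inner loop
 * recovers fd as i + (ffs(ibits) - 1).
 */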

/*
 * Poll system call.
 */
int
sys_poll(lwp_t *l, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */ * const uap = v;
	struct timeval atv, *tv = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		tv = &atv;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
		tv, NULL);
}
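
/*
 * The millisecond timeout is split into the timeval above; e.g. a
 * timeout argument of 1500 becomes { tv_sec = 1, tv_usec = 500000 }.
 * INFTIM (-1) leaves tv NULL, which pollcommon() treats as "block
 * until a descriptor becomes ready".
 */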

/*
 * Poll system call with signal mask and nanosecond timeout (pollts).
 */
int
sys_pollts(lwp_t *l, void *v, register_t *retval)
{
	struct sys_pollts_args /* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */ * const uap = v;
	struct timespec ats;
	struct timeval atv, *tv = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
		tv, mask);
}

int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}

int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int i, n;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}

/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
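
/*
 * seltrue() is a convenience for devices that are always ready for
 * I/O: a device's poll routine can simply call it (or point at it)
 * to report any requested read/write events as immediately true,
 * with no need for selrecord()/selnotify() bookkeeping.
 */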

/*
 * Record a select request.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}
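
/*
 * Typical use from a driver's poll routine (a hypothetical sketch;
 * "foo_softc", sc_count, and sc_rsel are illustrative driver fields,
 * not part of this file):
 *
 *	int
 *	foopoll(dev_t dev, int events, lwp_t *l)
 *	{
 *		struct foo_softc *sc = &foo_softc[minor(dev)];
 *		int revents = 0;
 *
 *		if ((events & (POLLIN | POLLRDNORM)) != 0) {
 *			if (sc->sc_count > 0)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(l, &sc->sc_rsel);
 *		}
 *		return revents;
 *	}
 *
 * When data later arrives, the driver's interrupt handler calls
 * selnotify(&sc->sc_rsel, 0) to wake any recorded waiter.
 */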

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}

void
selnotify(struct selinfo *sip, long knhint)
{

	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}
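
/*
 * selnotify() is the one entry point event sources need: it wakes any
 * select()/poll() waiters via selwakeup() and, in the same call, posts
 * the hint to the knote list so kqueue-based waiters see the event too.
 */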

/*
 * Remove an LWP from all objects that it is waiting for.
 */
static void
selclear(void)
{
	struct selinfo *sip;
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(&select_lock));

	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
		KASSERT(sip->sel_lwp == l);
		sip->sel_lwp = NULL;
	}
}

/*
 * Initialize the select/poll system calls.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
	cv_init(&select_cv, "select");
}