/*	$NetBSD: sys_generic.c,v 1.100.2.6 2007/04/28 22:40:04 ad Exp $	*/
2
3 /*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1989, 1993
41 * The Regents of the University of California. All rights reserved.
42 * (c) UNIX System Laboratories, Inc.
43 * All or some portions of this file are derived from material licensed
44 * to the University of California by American Telephone and Telegraph
45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46 * the permission of UNIX System Laboratories, Inc.
47 *
48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions
50 * are met:
51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution.
56 * 3. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
73 */
74
75 /*
76 * System calls relating to files.
77 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.100.2.6 2007/04/28 22:40:04 ad Exp $");
81
82 #include "opt_ktrace.h"
83
84 #include <sys/param.h>
85 #include <sys/systm.h>
86 #include <sys/filedesc.h>
87 #include <sys/ioctl.h>
88 #include <sys/file.h>
89 #include <sys/proc.h>
90 #include <sys/socketvar.h>
91 #include <sys/signalvar.h>
92 #include <sys/uio.h>
93 #include <sys/kernel.h>
94 #include <sys/stat.h>
95 #include <sys/kmem.h>
96 #include <sys/poll.h>
97 #include <sys/mount.h>
98 #include <sys/syscallargs.h>
99 #ifdef KTRACE
100 #include <sys/ktrace.h>
101 #endif
102
103 #include <uvm/uvm_extern.h>
104
105 /* Flags for lwp::l_selflag. */
106 #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
107 #define SEL_SCANNING 1 /* polling descriptors */
108 #define SEL_BLOCKING 2 /* about to block on select_cv */
109
110 static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
111 static int pollscan(lwp_t *, struct pollfd *, int, register_t *);
112 static void selclear(void);
113
114 /* Global state for select()/poll(). */
115 kmutex_t select_lock;
116 kcondvar_t select_cv;
117 int nselcoll;
118
119 /*
120 * Read system call.
121 */
122 /* ARGSUSED */
123 int
124 sys_read(lwp_t *l, void *v, register_t *retval)
125 {
126 struct sys_read_args /* {
127 syscallarg(int) fd;
128 syscallarg(void *) buf;
129 syscallarg(size_t) nbyte;
130 } */ *uap = v;
131 int fd;
132 struct file *fp;
133 proc_t *p;
134 struct filedesc *fdp;
135
136 fd = SCARG(uap, fd);
137 p = l->l_proc;
138 fdp = p->p_fd;
139
140 if ((fp = fd_getfile(fdp, fd)) == NULL)
141 return (EBADF);
142
143 if ((fp->f_flag & FREAD) == 0) {
144 mutex_exit(&fp->f_lock);
145 return (EBADF);
146 }
147
148 FILE_USE(fp);
149
150 /* dofileread() will unuse the descriptor for us */
151 return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
152 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
153 }
154
155 int
156 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
157 off_t *offset, int flags, register_t *retval)
158 {
159 struct iovec aiov;
160 struct uio auio;
161 proc_t *p;
162 struct vmspace *vm;
163 size_t cnt;
164 int error;
165 #ifdef KTRACE
166 struct iovec ktriov = { .iov_base = NULL, };
167 #endif
168 p = l->l_proc;
169
170 error = proc_vmspace_getref(p, &vm);
171 if (error) {
172 goto out;
173 }
174
175 aiov.iov_base = (void *)buf;
176 aiov.iov_len = nbyte;
177 auio.uio_iov = &aiov;
178 auio.uio_iovcnt = 1;
179 auio.uio_resid = nbyte;
180 auio.uio_rw = UIO_READ;
181 auio.uio_vmspace = vm;
182
183 /*
184 * Reads return ssize_t because -1 is returned on error. Therefore
185 * we must restrict the length to SSIZE_MAX to avoid garbage return
186 * values.
187 */
188 if (auio.uio_resid > SSIZE_MAX) {
189 error = EINVAL;
190 goto out;
191 }
192
193 #ifdef KTRACE
194 /*
195 * if tracing, save a copy of iovec
196 */
197 if (KTRPOINT(p, KTR_GENIO))
198 ktriov = aiov;
199 #endif
200 cnt = auio.uio_resid;
201 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
202 if (error)
203 if (auio.uio_resid != cnt && (error == ERESTART ||
204 error == EINTR || error == EWOULDBLOCK))
205 error = 0;
206 cnt -= auio.uio_resid;
207 #ifdef KTRACE
208 if (KTRPOINT(p, KTR_GENIO) && error == 0)
209 ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
210 #endif
211 *retval = cnt;
212 out:
213 FILE_UNUSE(fp, l);
214 uvmspace_free(vm);
215 return (error);
216 }
217
218 /*
219 * Scatter read system call.
220 */
221 int
222 sys_readv(lwp_t *l, void *v, register_t *retval)
223 {
224 struct sys_readv_args /* {
225 syscallarg(int) fd;
226 syscallarg(const struct iovec *) iovp;
227 syscallarg(int) iovcnt;
228 } */ *uap = v;
229 struct filedesc *fdp;
230 struct file *fp;
231 proc_t *p;
232 int fd;
233
234 fd = SCARG(uap, fd);
235 p = l->l_proc;
236 fdp = p->p_fd;
237
238 if ((fp = fd_getfile(fdp, fd)) == NULL)
239 return (EBADF);
240
241 if ((fp->f_flag & FREAD) == 0) {
242 mutex_exit(&fp->f_lock);
243 return (EBADF);
244 }
245
246 FILE_USE(fp);
247
248 /* dofilereadv() will unuse the descriptor for us */
249 return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
250 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
251 }
252
253 int
254 dofilereadv(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
255 int iovcnt, off_t *offset, int flags, register_t *retval)
256 {
257 proc_t *p;
258 struct uio auio;
259 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
260 struct vmspace *vm;
261 int i, error;
262 size_t cnt;
263 u_int iovlen;
264 #ifdef KTRACE
265 struct iovec *ktriov;
266 #endif
267
268 p = l->l_proc;
269 error = proc_vmspace_getref(p, &vm);
270 if (error) {
271 goto out;
272 }
273
274 #ifdef KTRACE
275 ktriov = NULL;
276 #endif
277 /* note: can't use iovlen until iovcnt is validated */
278 iovlen = iovcnt * sizeof(struct iovec);
279 if ((u_int)iovcnt > UIO_SMALLIOV) {
280 if ((u_int)iovcnt > IOV_MAX) {
281 error = EINVAL;
282 goto out;
283 }
284 iov = kmem_alloc(iovlen, KM_SLEEP);
285 needfree = iov;
286 } else if ((u_int)iovcnt > 0) {
287 iov = aiov;
288 needfree = NULL;
289 } else {
290 error = EINVAL;
291 goto out;
292 }
293
294 auio.uio_iov = iov;
295 auio.uio_iovcnt = iovcnt;
296 auio.uio_rw = UIO_READ;
297 auio.uio_vmspace = vm;
298 error = copyin(iovp, iov, iovlen);
299 if (error)
300 goto done;
301 auio.uio_resid = 0;
302 for (i = 0; i < iovcnt; i++) {
303 auio.uio_resid += iov->iov_len;
304 /*
305 * Reads return ssize_t because -1 is returned on error.
306 * Therefore we must restrict the length to SSIZE_MAX to
307 * avoid garbage return values.
308 */
309 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
310 error = EINVAL;
311 goto done;
312 }
313 iov++;
314 }
315 #ifdef KTRACE
316 /*
317 * if tracing, save a copy of iovec
318 */
319 if (KTRPOINT(p, KTR_GENIO)) {
320 ktriov = kmem_alloc(iovlen, KM_SLEEP);
321 memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
322 }
323 #endif
324 cnt = auio.uio_resid;
325 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
326 if (error)
327 if (auio.uio_resid != cnt && (error == ERESTART ||
328 error == EINTR || error == EWOULDBLOCK))
329 error = 0;
330 cnt -= auio.uio_resid;
331 #ifdef KTRACE
332 if (ktriov != NULL) {
333 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
334 ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
335 kmem_free(ktriov, iovlen);
336 }
337 #endif
338 *retval = cnt;
339 done:
340 if (needfree)
341 kmem_free(needfree, iovlen);
342 out:
343 FILE_UNUSE(fp, l);
344 uvmspace_free(vm);
345 return (error);
346 }
347
348 /*
349 * Write system call
350 */
351 int
352 sys_write(lwp_t *l, void *v, register_t *retval)
353 {
354 struct sys_write_args /* {
355 syscallarg(int) fd;
356 syscallarg(const void *) buf;
357 syscallarg(size_t) nbyte;
358 } */ *uap = v;
359 int fd;
360 struct file *fp;
361 proc_t *p;
362 struct filedesc *fdp;
363
364 fd = SCARG(uap, fd);
365 p = l->l_proc;
366 fdp = p->p_fd;
367
368 if ((fp = fd_getfile(fdp, fd)) == NULL)
369 return (EBADF);
370
371 if ((fp->f_flag & FWRITE) == 0) {
372 mutex_exit(&fp->f_lock);
373 return (EBADF);
374 }
375
376 FILE_USE(fp);
377
378 /* dofilewrite() will unuse the descriptor for us */
379 return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
380 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
381 }
382
383 int
384 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
385 size_t nbyte, off_t *offset, int flags, register_t *retval)
386 {
387 struct iovec aiov;
388 struct uio auio;
389 proc_t *p;
390 struct vmspace *vm;
391 size_t cnt;
392 int error;
393 #ifdef KTRACE
394 struct iovec ktriov = { .iov_base = NULL, };
395 #endif
396
397 p = l->l_proc;
398 error = proc_vmspace_getref(p, &vm);
399 if (error) {
400 goto out;
401 }
402 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
403 aiov.iov_len = nbyte;
404 auio.uio_iov = &aiov;
405 auio.uio_iovcnt = 1;
406 auio.uio_resid = nbyte;
407 auio.uio_rw = UIO_WRITE;
408 auio.uio_vmspace = vm;
409
410 /*
411 * Writes return ssize_t because -1 is returned on error. Therefore
412 * we must restrict the length to SSIZE_MAX to avoid garbage return
413 * values.
414 */
415 if (auio.uio_resid > SSIZE_MAX) {
416 error = EINVAL;
417 goto out;
418 }
419
420 #ifdef KTRACE
421 /*
422 * if tracing, save a copy of iovec
423 */
424 if (KTRPOINT(p, KTR_GENIO))
425 ktriov = aiov;
426 #endif
427 cnt = auio.uio_resid;
428 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
429 if (error) {
430 if (auio.uio_resid != cnt && (error == ERESTART ||
431 error == EINTR || error == EWOULDBLOCK))
432 error = 0;
433 if (error == EPIPE) {
434 mutex_enter(&proclist_mutex);
435 psignal(p, SIGPIPE);
436 mutex_exit(&proclist_mutex);
437 }
438 }
439 cnt -= auio.uio_resid;
440 #ifdef KTRACE
441 if (KTRPOINT(p, KTR_GENIO) && error == 0)
442 ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
443 #endif
444 *retval = cnt;
445 out:
446 FILE_UNUSE(fp, l);
447 uvmspace_free(vm);
448 return (error);
449 }
450
451 /*
452 * Gather write system call
453 */
454 int
455 sys_writev(lwp_t *l, void *v, register_t *retval)
456 {
457 struct sys_writev_args /* {
458 syscallarg(int) fd;
459 syscallarg(const struct iovec *) iovp;
460 syscallarg(int) iovcnt;
461 } */ *uap = v;
462 int fd;
463 struct file *fp;
464 proc_t *p;
465 struct filedesc *fdp;
466
467 fd = SCARG(uap, fd);
468 p = l->l_proc;
469 fdp = p->p_fd;
470
471 if ((fp = fd_getfile(fdp, fd)) == NULL)
472 return (EBADF);
473
474 if ((fp->f_flag & FWRITE) == 0) {
475 mutex_exit(&fp->f_lock);
476 return (EBADF);
477 }
478
479 FILE_USE(fp);
480
481 /* dofilewritev() will unuse the descriptor for us */
482 return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
483 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
484 }
485
486 int
487 dofilewritev(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
488 int iovcnt, off_t *offset, int flags, register_t *retval)
489 {
490 proc_t *p;
491 struct uio auio;
492 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
493 struct vmspace *vm;
494 int i, error;
495 size_t cnt;
496 u_int iovlen;
497 #ifdef KTRACE
498 struct iovec *ktriov;
499 #endif
500
501 p = l->l_proc;
502 error = proc_vmspace_getref(p, &vm);
503 if (error) {
504 goto out;
505 }
506 #ifdef KTRACE
507 ktriov = NULL;
508 #endif
509 /* note: can't use iovlen until iovcnt is validated */
510 iovlen = iovcnt * sizeof(struct iovec);
511 if ((u_int)iovcnt > UIO_SMALLIOV) {
512 if ((u_int)iovcnt > IOV_MAX) {
513 error = EINVAL;
514 goto out;
515 }
516 iov = kmem_alloc(iovlen, KM_SLEEP);
517 needfree = iov;
518 } else if ((u_int)iovcnt > 0) {
519 iov = aiov;
520 needfree = NULL;
521 } else {
522 error = EINVAL;
523 goto out;
524 }
525
526 auio.uio_iov = iov;
527 auio.uio_iovcnt = iovcnt;
528 auio.uio_rw = UIO_WRITE;
529 auio.uio_vmspace = vm;
530 error = copyin(iovp, iov, iovlen);
531 if (error)
532 goto done;
533 auio.uio_resid = 0;
534 for (i = 0; i < iovcnt; i++) {
535 auio.uio_resid += iov->iov_len;
536 /*
537 * Writes return ssize_t because -1 is returned on error.
538 * Therefore we must restrict the length to SSIZE_MAX to
539 * avoid garbage return values.
540 */
541 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
542 error = EINVAL;
543 goto done;
544 }
545 iov++;
546 }
547 #ifdef KTRACE
548 /*
549 * if tracing, save a copy of iovec
550 */
551 if (KTRPOINT(p, KTR_GENIO)) {
552 ktriov = kmem_alloc(iovlen, KM_SLEEP);
553 memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
554 }
555 #endif
556 cnt = auio.uio_resid;
557 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
558 if (error) {
559 if (auio.uio_resid != cnt && (error == ERESTART ||
560 error == EINTR || error == EWOULDBLOCK))
561 error = 0;
562 if (error == EPIPE) {
563 mutex_enter(&proclist_mutex);
564 psignal(p, SIGPIPE);
565 mutex_exit(&proclist_mutex);
566 }
567 }
568 cnt -= auio.uio_resid;
569 #ifdef KTRACE
570 if (ktriov != NULL) {
571 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
572 ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
573 kmem_free(ktriov, iovlen);
574 }
575 #endif
576 *retval = cnt;
577 done:
578 if (needfree)
579 kmem_free(needfree, iovlen);
580 out:
581 FILE_UNUSE(fp, l);
582 uvmspace_free(vm);
583 return (error);
584 }
585
586 /*
587 * Ioctl system call
588 */
589 /* ARGSUSED */
590 int
591 sys_ioctl(lwp_t *l, void *v, register_t *retval)
592 {
593 struct sys_ioctl_args /* {
594 syscallarg(int) fd;
595 syscallarg(u_long) com;
596 syscallarg(void *) data;
597 } */ *uap = v;
598 struct file *fp;
599 proc_t *p;
600 struct filedesc *fdp;
601 u_long com;
602 int error;
603 u_int size;
604 void *data, *memp;
605 #define STK_PARAMS 128
606 u_long stkbuf[STK_PARAMS/sizeof(u_long)];
607
608 error = 0;
609 p = l->l_proc;
610 fdp = p->p_fd;
611
612 if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
613 return (EBADF);
614
615 FILE_USE(fp);
616
617 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
618 error = EBADF;
619 com = 0;
620 goto out;
621 }
622
623 switch (com = SCARG(uap, com)) {
624 case FIONCLEX:
625 rw_enter(&fdp->fd_lock, RW_WRITER);
626 fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
627 rw_exit(&fdp->fd_lock);
628 goto out;
629
630 case FIOCLEX:
631 rw_enter(&fdp->fd_lock, RW_WRITER);
632 fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
633 rw_exit(&fdp->fd_lock);
634 goto out;
635 }
636
637 /*
638 * Interpret high order word to find amount of data to be
639 * copied to/from the user's address space.
640 */
641 size = IOCPARM_LEN(com);
642 if (size > IOCPARM_MAX) {
643 error = ENOTTY;
644 goto out;
645 }
646 memp = NULL;
647 if (size > sizeof(stkbuf)) {
648 memp = kmem_alloc(size, KM_SLEEP);
649 data = memp;
650 } else
651 data = (void *)stkbuf;
652 if (com&IOC_IN) {
653 if (size) {
654 error = copyin(SCARG(uap, data), data, size);
655 if (error) {
656 if (memp)
657 kmem_free(memp, size);
658 goto out;
659 }
660 #ifdef KTRACE
661 if (KTRPOINT(p, KTR_GENIO)) {
662 struct iovec iov;
663 iov.iov_base = SCARG(uap, data);
664 iov.iov_len = size;
665 ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
666 size, 0);
667 }
668 #endif
669 } else
670 *(void **)data = SCARG(uap, data);
671 } else if ((com&IOC_OUT) && size)
672 /*
673 * Zero the buffer so the user always
674 * gets back something deterministic.
675 */
676 memset(data, 0, size);
677 else if (com&IOC_VOID)
678 *(void **)data = SCARG(uap, data);
679
680 switch (com) {
681
682 case FIONBIO:
683 mutex_enter(&fp->f_lock);
684 if (*(int *)data != 0)
685 fp->f_flag |= FNONBLOCK;
686 else
687 fp->f_flag &= ~FNONBLOCK;
688 mutex_exit(&fp->f_lock);
689 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
690 break;
691
692 case FIOASYNC:
693 mutex_enter(&fp->f_lock);
694 if (*(int *)data != 0)
695 fp->f_flag |= FASYNC;
696 else
697 fp->f_flag &= ~FASYNC;
698 mutex_exit(&fp->f_lock);
699 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
700 break;
701
702 default:
703 error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
704 /*
705 * Copy any data to user, size was
706 * already set and checked above.
707 */
708 if (error == 0 && (com&IOC_OUT) && size) {
709 error = copyout(data, SCARG(uap, data), size);
710 #ifdef KTRACE
711 if (KTRPOINT(p, KTR_GENIO)) {
712 struct iovec iov;
713 iov.iov_base = SCARG(uap, data);
714 iov.iov_len = size;
715 ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
716 size, error);
717 }
718 #endif
719 }
720 break;
721 }
722 if (memp)
723 kmem_free(memp, size);
724 out:
725 FILE_UNUSE(fp, l);
726 switch (error) {
727 case -1:
728 printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
729 "pid=%d comm=%s\n",
730 (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
731 (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
732 p->p_pid, p->p_comm);
733 /* FALLTHROUGH */
734 case EPASSTHROUGH:
735 error = ENOTTY;
736 /* FALLTHROUGH */
737 default:
738 return (error);
739 }
740 }
741
742 /*
743 * Select system call.
744 */
745 int
746 sys_pselect(lwp_t *l, void *v, register_t *retval)
747 {
748 struct sys_pselect_args /* {
749 syscallarg(int) nd;
750 syscallarg(fd_set *) in;
751 syscallarg(fd_set *) ou;
752 syscallarg(fd_set *) ex;
753 syscallarg(const struct timespec *) ts;
754 syscallarg(sigset_t *) mask;
755 } */ * const uap = v;
756 struct timespec ats;
757 struct timeval atv, *tv = NULL;
758 sigset_t amask, *mask = NULL;
759 int error;
760
761 if (SCARG(uap, ts)) {
762 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
763 if (error)
764 return error;
765 atv.tv_sec = ats.tv_sec;
766 atv.tv_usec = ats.tv_nsec / 1000;
767 tv = &atv;
768 }
769 if (SCARG(uap, mask) != NULL) {
770 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
771 if (error)
772 return error;
773 mask = &amask;
774 }
775
776 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
777 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
778 }
779
/*
 * Validate a relative timeout and record the current monotonic time in
 * *sleeptv as the base for later gettimeleft() calls.  Returns -1 if the
 * timeout is malformed (per itimerfix()), 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv))
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
788
789 int
790 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
791 {
792 /*
793 * We have to recalculate the timeout on every retry.
794 */
795 struct timeval slepttv;
796 /*
797 * reduce tv by elapsed time
798 * based on monotonic time scale
799 */
800 getmicrouptime(&slepttv);
801 timeradd(tv, sleeptv, tv);
802 timersub(tv, &slepttv, tv);
803 *sleeptv = slepttv;
804 return tvtohz(tv);
805 }
806
807 int
808 sys_select(lwp_t *l, void *v, register_t *retval)
809 {
810 struct sys_select_args /* {
811 syscallarg(int) nd;
812 syscallarg(fd_set *) in;
813 syscallarg(fd_set *) ou;
814 syscallarg(fd_set *) ex;
815 syscallarg(struct timeval *) tv;
816 } */ * const uap = v;
817 struct timeval atv, *tv = NULL;
818 int error;
819
820 if (SCARG(uap, tv)) {
821 error = copyin(SCARG(uap, tv), (void *)&atv,
822 sizeof(atv));
823 if (error)
824 return error;
825 tv = &atv;
826 }
827
828 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
829 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
830 }
831
832 int
833 selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
834 fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
835 {
836 char smallbits[howmany(FD_SETSIZE, NFDBITS) *
837 sizeof(fd_mask) * 6];
838 proc_t * const p = l->l_proc;
839 char *bits;
840 int ncoll, error, timo;
841 size_t ni;
842 sigset_t oldmask;
843 struct timeval sleeptv;
844
845 error = 0;
846 if (nd < 0)
847 return (EINVAL);
848 if (nd > p->p_fd->fd_nfiles) {
849 /* forgiving; slightly wrong */
850 nd = p->p_fd->fd_nfiles;
851 }
852 ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
853 if (ni * 6 > sizeof(smallbits))
854 bits = kmem_alloc(ni * 6, KM_SLEEP);
855 else
856 bits = smallbits;
857
858 #define getbits(name, x) \
859 if (u_ ## name) { \
860 error = copyin(u_ ## name, bits + ni * x, ni); \
861 if (error) \
862 goto done; \
863 } else \
864 memset(bits + ni * x, 0, ni);
865 getbits(in, 0);
866 getbits(ou, 1);
867 getbits(ex, 2);
868 #undef getbits
869
870 timo = 0;
871 if (tv && inittimeleft(tv, &sleeptv) == -1) {
872 error = EINVAL;
873 goto done;
874 }
875
876 if (mask) {
877 sigminusset(&sigcantmask, mask);
878 mutex_enter(&p->p_smutex);
879 oldmask = l->l_sigmask;
880 l->l_sigmask = *mask;
881 mutex_exit(&p->p_smutex);
882 } else
883 oldmask = l->l_sigmask; /* XXXgcc */
884
885 mutex_enter(&select_lock);
886 SLIST_INIT(&l->l_selwait);
887 for (;;) {
888 l->l_selflag = SEL_SCANNING;
889 ncoll = nselcoll;
890 mutex_exit(&select_lock);
891
892 error = selscan(l, (fd_mask *)(bits + ni * 0),
893 (fd_mask *)(bits + ni * 3), nd, retval);
894
895 mutex_enter(&select_lock);
896 if (error || *retval)
897 break;
898 if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
899 break;
900 if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
901 continue;
902 l->l_selflag = SEL_BLOCKING;
903 error = cv_timedwait_sig(&select_cv, &select_lock, timo);
904 if (error != 0)
905 break;
906 }
907 selclear();
908 mutex_exit(&select_lock);
909
910 if (mask) {
911 mutex_enter(&p->p_smutex);
912 l->l_sigmask = oldmask;
913 mutex_exit(&p->p_smutex);
914 }
915
916 done:
917 /* select is not restarted after signals... */
918 if (error == ERESTART)
919 error = EINTR;
920 if (error == EWOULDBLOCK)
921 error = 0;
922 if (error == 0 && u_in != NULL)
923 error = copyout(bits + ni * 3, u_in, ni);
924 if (error == 0 && u_ou != NULL)
925 error = copyout(bits + ni * 4, u_ou, ni);
926 if (error == 0 && u_ex != NULL)
927 error = copyout(bits + ni * 5, u_ex, ni);
928 if (bits != smallbits)
929 kmem_free(bits, ni * 6);
930 return (error);
931 }
932
933 int
934 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
935 register_t *retval)
936 {
937 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
938 POLLWRNORM | POLLHUP | POLLERR,
939 POLLRDBAND };
940 proc_t *p = l->l_proc;
941 struct filedesc *fdp;
942 int msk, i, j, fd, n;
943 fd_mask ibits, obits;
944 struct file *fp;
945
946 fdp = p->p_fd;
947 n = 0;
948 for (msk = 0; msk < 3; msk++) {
949 for (i = 0; i < nfd; i += NFDBITS) {
950 ibits = *ibitp++;
951 obits = 0;
952 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
953 ibits &= ~(1 << j);
954 if ((fp = fd_getfile(fdp, fd)) == NULL)
955 return (EBADF);
956 FILE_USE(fp);
957 if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
958 obits |= (1 << j);
959 n++;
960 }
961 FILE_UNUSE(fp, l);
962 }
963 *obitp++ = obits;
964 }
965 }
966 *retval = n;
967 return (0);
968 }
969
970 /*
971 * Poll system call.
972 */
973 int
974 sys_poll(lwp_t *l, void *v, register_t *retval)
975 {
976 struct sys_poll_args /* {
977 syscallarg(struct pollfd *) fds;
978 syscallarg(u_int) nfds;
979 syscallarg(int) timeout;
980 } */ * const uap = v;
981 struct timeval atv, *tv = NULL;
982
983 if (SCARG(uap, timeout) != INFTIM) {
984 atv.tv_sec = SCARG(uap, timeout) / 1000;
985 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
986 tv = &atv;
987 }
988
989 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
990 tv, NULL);
991 }
992
993 /*
994 * Poll system call.
995 */
996 int
997 sys_pollts(lwp_t *l, void *v, register_t *retval)
998 {
999 struct sys_pollts_args /* {
1000 syscallarg(struct pollfd *) fds;
1001 syscallarg(u_int) nfds;
1002 syscallarg(const struct timespec *) ts;
1003 syscallarg(const sigset_t *) mask;
1004 } */ * const uap = v;
1005 struct timespec ats;
1006 struct timeval atv, *tv = NULL;
1007 sigset_t amask, *mask = NULL;
1008 int error;
1009
1010 if (SCARG(uap, ts)) {
1011 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
1012 if (error)
1013 return error;
1014 atv.tv_sec = ats.tv_sec;
1015 atv.tv_usec = ats.tv_nsec / 1000;
1016 tv = &atv;
1017 }
1018 if (SCARG(uap, mask)) {
1019 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
1020 if (error)
1021 return error;
1022 mask = &amask;
1023 }
1024
1025 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1026 tv, mask);
1027 }
1028
1029 int
1030 pollcommon(lwp_t *l, register_t *retval,
1031 struct pollfd *u_fds, u_int nfds,
1032 struct timeval *tv, sigset_t *mask)
1033 {
1034 char smallbits[32 * sizeof(struct pollfd)];
1035 proc_t * const p = l->l_proc;
1036 void * bits;
1037 sigset_t oldmask;
1038 int ncoll, error, timo;
1039 size_t ni;
1040 struct timeval sleeptv;
1041
1042 if (nfds > p->p_fd->fd_nfiles) {
1043 /* forgiving; slightly wrong */
1044 nfds = p->p_fd->fd_nfiles;
1045 }
1046 ni = nfds * sizeof(struct pollfd);
1047 if (ni > sizeof(smallbits))
1048 bits = kmem_alloc(ni, KM_SLEEP);
1049 else
1050 bits = smallbits;
1051
1052 error = copyin(u_fds, bits, ni);
1053 if (error)
1054 goto done;
1055
1056 timo = 0;
1057 if (tv && inittimeleft(tv, &sleeptv) == -1) {
1058 error = EINVAL;
1059 goto done;
1060 }
1061
1062 if (mask) {
1063 sigminusset(&sigcantmask, mask);
1064 mutex_enter(&p->p_smutex);
1065 oldmask = l->l_sigmask;
1066 l->l_sigmask = *mask;
1067 mutex_exit(&p->p_smutex);
1068 } else
1069 oldmask = l->l_sigmask; /* XXXgcc */
1070
1071 mutex_enter(&select_lock);
1072 SLIST_INIT(&l->l_selwait);
1073 for (;;) {
1074 ncoll = nselcoll;
1075 l->l_selflag = SEL_SCANNING;
1076 mutex_exit(&select_lock);
1077
1078 error = pollscan(l, (struct pollfd *)bits, nfds, retval);
1079
1080 mutex_enter(&select_lock);
1081 if (error || *retval)
1082 break;
1083 if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
1084 break;
1085 if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
1086 continue;
1087 l->l_selflag = SEL_BLOCKING;
1088 error = cv_timedwait_sig(&select_cv, &select_lock, timo);
1089 if (error != 0)
1090 break;
1091 }
1092 selclear();
1093 mutex_exit(&select_lock);
1094
1095 if (mask) {
1096 mutex_enter(&p->p_smutex);
1097 l->l_sigmask = oldmask;
1098 mutex_exit(&p->p_smutex);
1099 }
1100 done:
1101 /* poll is not restarted after signals... */
1102 if (error == ERESTART)
1103 error = EINTR;
1104 if (error == EWOULDBLOCK)
1105 error = 0;
1106 if (error == 0)
1107 error = copyout(bits, u_fds, ni);
1108 if (bits != smallbits)
1109 kmem_free(bits, ni);
1110 return (error);
1111 }
1112
1113 int
1114 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1115 {
1116 proc_t *p = l->l_proc;
1117 struct filedesc *fdp;
1118 int i, n;
1119 struct file *fp;
1120
1121 fdp = p->p_fd;
1122 n = 0;
1123 for (i = 0; i < nfd; i++, fds++) {
1124 if (fds->fd >= fdp->fd_nfiles) {
1125 fds->revents = POLLNVAL;
1126 n++;
1127 } else if (fds->fd < 0) {
1128 fds->revents = 0;
1129 } else {
1130 if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1131 fds->revents = POLLNVAL;
1132 n++;
1133 } else {
1134 FILE_USE(fp);
1135 fds->revents = (*fp->f_ops->fo_poll)(fp,
1136 fds->events | POLLERR | POLLHUP, l);
1137 if (fds->revents != 0)
1138 n++;
1139 FILE_UNUSE(fp, l);
1140 }
1141 }
1142 }
1143 *retval = n;
1144 return (0);
1145 }
1146
1147 /*ARGSUSED*/
1148 int
1149 seltrue(dev_t dev, int events, lwp_t *l)
1150 {
1151
1152 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1153 }
1154
1155 /*
1156 * Record a select request.
1157 */
1158 void
1159 selrecord(lwp_t *selector, struct selinfo *sip)
1160 {
1161
1162 mutex_enter(&select_lock);
1163 if (sip->sel_lwp == NULL) {
1164 /* First named waiter, although there may be more. */
1165 sip->sel_lwp = selector;
1166 SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
1167 } else if (sip->sel_lwp != selector) {
1168 /* Multiple waiters. */
1169 sip->sel_collision = true;
1170 }
1171 mutex_exit(&select_lock);
1172 }
1173
1174 /*
1175 * Do a wakeup when a selectable event occurs.
1176 */
1177 void
1178 selwakeup(struct selinfo *sip)
1179 {
1180 lwp_t *l;
1181
1182 mutex_enter(&select_lock);
1183 if (sip->sel_collision) {
1184 /* Multiple waiters - just notify everybody. */
1185 nselcoll++;
1186 sip->sel_collision = false;
1187 cv_broadcast(&select_cv);
1188 } else if (sip->sel_lwp != NULL) {
1189 /* Only one LWP waiting. */
1190 l = sip->sel_lwp;
1191 if (l->l_selflag == SEL_BLOCKING) {
1192 /*
1193 * If it's sleeping, wake it up. If not, it's
1194 * already awake but hasn't yet removed itself
1195 * from the selector. We reset the state below
1196 * so that we only attempt to do this once.
1197 */
1198 lwp_lock(l);
1199 if (l->l_wchan == &select_cv) {
1200 /* lwp_unsleep() releases the LWP lock. */
1201 lwp_unsleep(l);
1202 } else
1203 lwp_unlock(l);
1204 } else {
1205 /*
1206 * Not yet asleep. Reset its state below so that
1207 * it will go around again.
1208 */
1209 }
1210 l->l_selflag = SEL_RESET;
1211 }
1212 mutex_exit(&select_lock);
1213 }
1214
1215 void
1216 selnotify(struct selinfo *sip, long knhint)
1217 {
1218
1219 selwakeup(sip);
1220 KNOTE(&sip->sel_klist, knhint);
1221 }
1222
1223 /*
1224 * Remove an LWP from all objects that it is waiting for.
1225 */
1226 static void
1227 selclear(void)
1228 {
1229 struct selinfo *sip;
1230 lwp_t *l = curlwp;
1231
1232 KASSERT(mutex_owned(&select_lock));
1233
1234 SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1235 KASSERT(sip->sel_lwp == l);
1236 sip->sel_lwp = NULL;
1237 }
1238 }
1239
1240 /*
1241 * Initialize the select/poll system calls.
1242 */
1243 void
1244 selsysinit(void)
1245 {
1246
1247 mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
1248 cv_init(&select_cv, "select");
1249 }
1250