/*	$NetBSD: sys_generic.c,v 1.100.2.7 2007/06/09 23:58:06 ad Exp $	*/

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */
74
/*
 * System calls relating to files: read/write/ioctl and the
 * select/poll family.
 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.100.2.7 2007/06/09 23:58:06 ad Exp $");
81
82 #include "opt_ktrace.h"
83
84 #include <sys/param.h>
85 #include <sys/systm.h>
86 #include <sys/filedesc.h>
87 #include <sys/ioctl.h>
88 #include <sys/file.h>
89 #include <sys/proc.h>
90 #include <sys/socketvar.h>
91 #include <sys/signalvar.h>
92 #include <sys/uio.h>
93 #include <sys/kernel.h>
94 #include <sys/stat.h>
95 #include <sys/kmem.h>
96 #include <sys/poll.h>
97 #include <sys/mount.h>
98 #include <sys/syscallargs.h>
99 #ifdef KTRACE
100 #include <sys/ktrace.h>
101 #endif
102
103 #include <uvm/uvm_extern.h>
104
/*
 * Flags for lwp::l_selflag.  These implement the handshake between a
 * select/poll waiter and selwakeup(): the waiter advances RESET ->
 * SCANNING -> BLOCKING, and selwakeup() knocks it back to RESET to
 * force a rescan.
 */
#define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING 1 /* polling descriptors */
#define SEL_BLOCKING 2 /* about to block on select_cv */

/* Internal helpers, defined below. */
static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void selclear(void);

/* Global state for select()/poll(). */
kmutex_t select_lock;	/* guards l_selflag, selinfo back-pointers */
kcondvar_t select_cv;	/* waiters block here */
int nselcoll;		/* collision generation; bumped by selwakeup() */
118
119 /*
120 * Read system call.
121 */
122 /* ARGSUSED */
123 int
124 sys_read(lwp_t *l, void *v, register_t *retval)
125 {
126 struct sys_read_args /* {
127 syscallarg(int) fd;
128 syscallarg(void *) buf;
129 syscallarg(size_t) nbyte;
130 } */ *uap = v;
131 int fd;
132 struct file *fp;
133 proc_t *p;
134 struct filedesc *fdp;
135
136 fd = SCARG(uap, fd);
137 p = l->l_proc;
138 fdp = p->p_fd;
139
140 if ((fp = fd_getfile(fdp, fd)) == NULL)
141 return (EBADF);
142
143 if ((fp->f_flag & FREAD) == 0) {
144 mutex_exit(&fp->f_lock);
145 return (EBADF);
146 }
147
148 FILE_USE(fp);
149
150 /* dofileread() will unuse the descriptor for us */
151 return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
152 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
153 }
154
155 int
156 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
157 off_t *offset, int flags, register_t *retval)
158 {
159 struct iovec aiov;
160 struct uio auio;
161 proc_t *p;
162 struct vmspace *vm;
163 size_t cnt;
164 int error;
165 #ifdef KTRACE
166 struct iovec ktriov;
167 #endif
168 p = l->l_proc;
169
170 error = proc_vmspace_getref(p, &vm);
171 if (error) {
172 goto out;
173 }
174
175 aiov.iov_base = (void *)buf;
176 aiov.iov_len = nbyte;
177 auio.uio_iov = &aiov;
178 auio.uio_iovcnt = 1;
179 auio.uio_resid = nbyte;
180 auio.uio_rw = UIO_READ;
181 auio.uio_vmspace = vm;
182
183 /*
184 * Reads return ssize_t because -1 is returned on error. Therefore
185 * we must restrict the length to SSIZE_MAX to avoid garbage return
186 * values.
187 */
188 if (auio.uio_resid > SSIZE_MAX) {
189 error = EINVAL;
190 goto out;
191 }
192
193 #ifdef KTRACE
194 /* In case we are tracing, save a copy of iovec */
195 ktriov = aiov;
196 #endif
197 cnt = auio.uio_resid;
198 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
199 if (error)
200 if (auio.uio_resid != cnt && (error == ERESTART ||
201 error == EINTR || error == EWOULDBLOCK))
202 error = 0;
203 cnt -= auio.uio_resid;
204 #ifdef KTRACE
205 if (KTRPOINT(p, KTR_GENIO) && error == 0)
206 ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
207 #endif
208 *retval = cnt;
209 out:
210 FILE_UNUSE(fp, l);
211 uvmspace_free(vm);
212 return (error);
213 }
214
215 /*
216 * Scatter read system call.
217 */
218 int
219 sys_readv(lwp_t *l, void *v, register_t *retval)
220 {
221 struct sys_readv_args /* {
222 syscallarg(int) fd;
223 syscallarg(const struct iovec *) iovp;
224 syscallarg(int) iovcnt;
225 } */ *uap = v;
226 struct filedesc *fdp;
227 struct file *fp;
228 proc_t *p;
229 int fd;
230
231 fd = SCARG(uap, fd);
232 p = l->l_proc;
233 fdp = p->p_fd;
234
235 if ((fp = fd_getfile(fdp, fd)) == NULL)
236 return (EBADF);
237
238 if ((fp->f_flag & FREAD) == 0) {
239 mutex_exit(&fp->f_lock);
240 return (EBADF);
241 }
242
243 FILE_USE(fp);
244
245 /* dofilereadv() will unuse the descriptor for us */
246 return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
247 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
248 }
249
250 int
251 dofilereadv(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
252 int iovcnt, off_t *offset, int flags, register_t *retval)
253 {
254 proc_t *p;
255 struct uio auio;
256 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
257 struct vmspace *vm;
258 int i, error;
259 size_t cnt;
260 u_int iovlen;
261 #ifdef KTRACE
262 struct iovec *ktriov;
263 #endif
264
265 p = l->l_proc;
266 error = proc_vmspace_getref(p, &vm);
267 if (error) {
268 goto out;
269 }
270
271 #ifdef KTRACE
272 ktriov = NULL;
273 #endif
274 /* note: can't use iovlen until iovcnt is validated */
275 iovlen = iovcnt * sizeof(struct iovec);
276 if ((u_int)iovcnt > UIO_SMALLIOV) {
277 if ((u_int)iovcnt > IOV_MAX) {
278 error = EINVAL;
279 goto out;
280 }
281 iov = kmem_alloc(iovlen, KM_SLEEP);
282 needfree = iov;
283 } else if ((u_int)iovcnt > 0) {
284 iov = aiov;
285 needfree = NULL;
286 } else {
287 error = EINVAL;
288 goto out;
289 }
290
291 auio.uio_iov = iov;
292 auio.uio_iovcnt = iovcnt;
293 auio.uio_rw = UIO_READ;
294 auio.uio_vmspace = vm;
295 error = copyin(iovp, iov, iovlen);
296 if (error)
297 goto done;
298 auio.uio_resid = 0;
299 for (i = 0; i < iovcnt; i++) {
300 auio.uio_resid += iov->iov_len;
301 /*
302 * Reads return ssize_t because -1 is returned on error.
303 * Therefore we must restrict the length to SSIZE_MAX to
304 * avoid garbage return values.
305 */
306 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
307 error = EINVAL;
308 goto done;
309 }
310 iov++;
311 }
312 #ifdef KTRACE
313 /*
314 * if tracing, save a copy of iovec
315 */
316 if (KTRPOINT(p, KTR_GENIO)) {
317 ktriov = kmem_alloc(iovlen, KM_SLEEP);
318 memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
319 }
320 #endif
321 cnt = auio.uio_resid;
322 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
323 if (error)
324 if (auio.uio_resid != cnt && (error == ERESTART ||
325 error == EINTR || error == EWOULDBLOCK))
326 error = 0;
327 cnt -= auio.uio_resid;
328 #ifdef KTRACE
329 if (ktriov != NULL) {
330 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
331 ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
332 kmem_free(ktriov, iovlen);
333 }
334 #endif
335 *retval = cnt;
336 done:
337 if (needfree)
338 kmem_free(needfree, iovlen);
339 out:
340 FILE_UNUSE(fp, l);
341 uvmspace_free(vm);
342 return (error);
343 }
344
345 /*
346 * Write system call
347 */
348 int
349 sys_write(lwp_t *l, void *v, register_t *retval)
350 {
351 struct sys_write_args /* {
352 syscallarg(int) fd;
353 syscallarg(const void *) buf;
354 syscallarg(size_t) nbyte;
355 } */ *uap = v;
356 int fd;
357 struct file *fp;
358 proc_t *p;
359 struct filedesc *fdp;
360
361 fd = SCARG(uap, fd);
362 p = l->l_proc;
363 fdp = p->p_fd;
364
365 if ((fp = fd_getfile(fdp, fd)) == NULL)
366 return (EBADF);
367
368 if ((fp->f_flag & FWRITE) == 0) {
369 mutex_exit(&fp->f_lock);
370 return (EBADF);
371 }
372
373 FILE_USE(fp);
374
375 /* dofilewrite() will unuse the descriptor for us */
376 return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
377 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
378 }
379
380 int
381 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
382 size_t nbyte, off_t *offset, int flags, register_t *retval)
383 {
384 struct iovec aiov;
385 struct uio auio;
386 proc_t *p;
387 struct vmspace *vm;
388 size_t cnt;
389 int error;
390 #ifdef KTRACE
391 struct iovec ktriov;
392 #endif
393
394 p = l->l_proc;
395 error = proc_vmspace_getref(p, &vm);
396 if (error) {
397 goto out;
398 }
399 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
400 aiov.iov_len = nbyte;
401 auio.uio_iov = &aiov;
402 auio.uio_iovcnt = 1;
403 auio.uio_resid = nbyte;
404 auio.uio_rw = UIO_WRITE;
405 auio.uio_vmspace = vm;
406
407 /*
408 * Writes return ssize_t because -1 is returned on error. Therefore
409 * we must restrict the length to SSIZE_MAX to avoid garbage return
410 * values.
411 */
412 if (auio.uio_resid > SSIZE_MAX) {
413 error = EINVAL;
414 goto out;
415 }
416
417 #ifdef KTRACE
418 /* In case we are tracing, save a copy of iovec */
419 ktriov = aiov;
420 #endif
421 cnt = auio.uio_resid;
422 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
423 if (error) {
424 if (auio.uio_resid != cnt && (error == ERESTART ||
425 error == EINTR || error == EWOULDBLOCK))
426 error = 0;
427 if (error == EPIPE) {
428 mutex_enter(&proclist_mutex);
429 psignal(p, SIGPIPE);
430 mutex_exit(&proclist_mutex);
431 }
432 }
433 cnt -= auio.uio_resid;
434 #ifdef KTRACE
435 if (KTRPOINT(p, KTR_GENIO) && error == 0)
436 ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
437 #endif
438 *retval = cnt;
439 out:
440 FILE_UNUSE(fp, l);
441 uvmspace_free(vm);
442 return (error);
443 }
444
445 /*
446 * Gather write system call
447 */
448 int
449 sys_writev(lwp_t *l, void *v, register_t *retval)
450 {
451 struct sys_writev_args /* {
452 syscallarg(int) fd;
453 syscallarg(const struct iovec *) iovp;
454 syscallarg(int) iovcnt;
455 } */ *uap = v;
456 int fd;
457 struct file *fp;
458 proc_t *p;
459 struct filedesc *fdp;
460
461 fd = SCARG(uap, fd);
462 p = l->l_proc;
463 fdp = p->p_fd;
464
465 if ((fp = fd_getfile(fdp, fd)) == NULL)
466 return (EBADF);
467
468 if ((fp->f_flag & FWRITE) == 0) {
469 mutex_exit(&fp->f_lock);
470 return (EBADF);
471 }
472
473 FILE_USE(fp);
474
475 /* dofilewritev() will unuse the descriptor for us */
476 return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
477 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
478 }
479
480 int
481 dofilewritev(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
482 int iovcnt, off_t *offset, int flags, register_t *retval)
483 {
484 proc_t *p;
485 struct uio auio;
486 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
487 struct vmspace *vm;
488 int i, error;
489 size_t cnt;
490 u_int iovlen;
491 #ifdef KTRACE
492 struct iovec *ktriov;
493 #endif
494
495 p = l->l_proc;
496 error = proc_vmspace_getref(p, &vm);
497 if (error) {
498 goto out;
499 }
500 #ifdef KTRACE
501 ktriov = NULL;
502 #endif
503 /* note: can't use iovlen until iovcnt is validated */
504 iovlen = iovcnt * sizeof(struct iovec);
505 if ((u_int)iovcnt > UIO_SMALLIOV) {
506 if ((u_int)iovcnt > IOV_MAX) {
507 error = EINVAL;
508 goto out;
509 }
510 iov = kmem_alloc(iovlen, KM_SLEEP);
511 needfree = iov;
512 } else if ((u_int)iovcnt > 0) {
513 iov = aiov;
514 needfree = NULL;
515 } else {
516 error = EINVAL;
517 goto out;
518 }
519
520 auio.uio_iov = iov;
521 auio.uio_iovcnt = iovcnt;
522 auio.uio_rw = UIO_WRITE;
523 auio.uio_vmspace = vm;
524 error = copyin(iovp, iov, iovlen);
525 if (error)
526 goto done;
527 auio.uio_resid = 0;
528 for (i = 0; i < iovcnt; i++) {
529 auio.uio_resid += iov->iov_len;
530 /*
531 * Writes return ssize_t because -1 is returned on error.
532 * Therefore we must restrict the length to SSIZE_MAX to
533 * avoid garbage return values.
534 */
535 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
536 error = EINVAL;
537 goto done;
538 }
539 iov++;
540 }
541 #ifdef KTRACE
542 /*
543 * if tracing, save a copy of iovec
544 */
545 if (KTRPOINT(p, KTR_GENIO)) {
546 ktriov = kmem_alloc(iovlen, KM_SLEEP);
547 memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
548 }
549 #endif
550 cnt = auio.uio_resid;
551 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
552 if (error) {
553 if (auio.uio_resid != cnt && (error == ERESTART ||
554 error == EINTR || error == EWOULDBLOCK))
555 error = 0;
556 if (error == EPIPE) {
557 mutex_enter(&proclist_mutex);
558 psignal(p, SIGPIPE);
559 mutex_exit(&proclist_mutex);
560 }
561 }
562 cnt -= auio.uio_resid;
563 #ifdef KTRACE
564 if (ktriov != NULL) {
565 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
566 ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
567 kmem_free(ktriov, iovlen);
568 }
569 #endif
570 *retval = cnt;
571 done:
572 if (needfree)
573 kmem_free(needfree, iovlen);
574 out:
575 FILE_UNUSE(fp, l);
576 uvmspace_free(vm);
577 return (error);
578 }
579
/*
 * Ioctl system call.
 *
 * Handles the descriptor-table requests (FIONCLEX/FIOCLEX) and the
 * per-file flag requests (FIONBIO/FIOASYNC) in-line; everything else
 * is dispatched to the file's fo_ioctl method.  The IOC_IN/IOC_OUT/
 * IOC_VOID bits of the command encode how argument data moves between
 * user and kernel space.
 */
/* ARGSUSED */
int
sys_ioctl(lwp_t *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
/* Argument staging buffers up to this size live on the stack. */
#define STK_PARAMS 128
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		/* com is read in the diagnostic at "out"; keep it defined. */
		com = 0;
		goto out;
	}

	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		/* Clear the close-on-exec flag in the descriptor table. */
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		/* Set the close-on-exec flag in the descriptor table. */
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		/* Too big for the stack buffer; allocate. */
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			/* Copy the user's argument structure in. */
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
					size, 0);
			}
#endif
		} else
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		/* IOC_VOID: the argument is the pointer value itself. */
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Toggle non-blocking I/O, then inform the file object. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Toggle async I/O notification, then inform the file. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
					size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/*
		 * -1 from a driver is a bug (it leaks into userland as a
		 * bogus errno); log it and map to ENOTTY below.
		 */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
			"pid=%d comm=%s\n",
			(com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
			(char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
			p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		/* No layer claimed the command. */
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
735
736 /*
737 * Select system call.
738 */
739 int
740 sys_pselect(lwp_t *l, void *v, register_t *retval)
741 {
742 struct sys_pselect_args /* {
743 syscallarg(int) nd;
744 syscallarg(fd_set *) in;
745 syscallarg(fd_set *) ou;
746 syscallarg(fd_set *) ex;
747 syscallarg(const struct timespec *) ts;
748 syscallarg(sigset_t *) mask;
749 } */ * const uap = v;
750 struct timespec ats;
751 struct timeval atv, *tv = NULL;
752 sigset_t amask, *mask = NULL;
753 int error;
754
755 if (SCARG(uap, ts)) {
756 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
757 if (error)
758 return error;
759 atv.tv_sec = ats.tv_sec;
760 atv.tv_usec = ats.tv_nsec / 1000;
761 tv = &atv;
762 }
763 if (SCARG(uap, mask) != NULL) {
764 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
765 if (error)
766 return error;
767 mask = &amask;
768 }
769
770 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
771 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
772 }
773
/*
 * Validate *tv and record the current monotonic uptime in *sleeptv
 * so gettimeleft() can later compute elapsed time.
 * Returns 0 on success, -1 if the timeout is invalid.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
782
783 int
784 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
785 {
786 /*
787 * We have to recalculate the timeout on every retry.
788 */
789 struct timeval slepttv;
790 /*
791 * reduce tv by elapsed time
792 * based on monotonic time scale
793 */
794 getmicrouptime(&slepttv);
795 timeradd(tv, sleeptv, tv);
796 timersub(tv, &slepttv, tv);
797 *sleeptv = slepttv;
798 return tvtohz(tv);
799 }
800
801 int
802 sys_select(lwp_t *l, void *v, register_t *retval)
803 {
804 struct sys_select_args /* {
805 syscallarg(int) nd;
806 syscallarg(fd_set *) in;
807 syscallarg(fd_set *) ou;
808 syscallarg(fd_set *) ex;
809 syscallarg(struct timeval *) tv;
810 } */ * const uap = v;
811 struct timeval atv, *tv = NULL;
812 int error;
813
814 if (SCARG(uap, tv)) {
815 error = copyin(SCARG(uap, tv), (void *)&atv,
816 sizeof(atv));
817 if (error)
818 return error;
819 tv = &atv;
820 }
821
822 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
823 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
824 }
825
/*
 * Common code for select()/pselect().
 *
 * "bits" holds six contiguous fd_mask arrays of "ni" bytes each:
 * slots 0-2 are the caller's in/ou/ex input sets, slots 3-5 receive
 * the corresponding output sets (see the selscan() call and the
 * copyout()s below).  The retry loop cooperates with selwakeup()
 * through l_selflag and nselcoll: if either changes while we were
 * scanning, an event may have been missed and we rescan instead of
 * blocking on select_cv.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	proc_t * const p = l->l_proc;
	char *bits;
	int ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

/* Copy in one user set (or zero-fill it if the pointer is NULL). */
#define getbits(name, x) \
	if (u_ ## name) { \
		error = copyin(u_ ## name, bits + ni * x, ni); \
		if (error) \
			goto done; \
	} else \
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Temporarily install the caller-supplied signal mask. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Mark ourselves as scanning and snapshot the collision
		 * counter; selwakeup() changes one or the other if an
		 * event fires while we're not holding select_lock.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* A wakeup or collision during the scan: scan again. */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		/* Restore the original signal mask. */
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
926
927 int
928 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
929 register_t *retval)
930 {
931 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
932 POLLWRNORM | POLLHUP | POLLERR,
933 POLLRDBAND };
934 proc_t *p = l->l_proc;
935 struct filedesc *fdp;
936 int msk, i, j, fd, n;
937 fd_mask ibits, obits;
938 struct file *fp;
939
940 fdp = p->p_fd;
941 n = 0;
942 for (msk = 0; msk < 3; msk++) {
943 for (i = 0; i < nfd; i += NFDBITS) {
944 ibits = *ibitp++;
945 obits = 0;
946 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
947 ibits &= ~(1 << j);
948 if ((fp = fd_getfile(fdp, fd)) == NULL)
949 return (EBADF);
950 FILE_USE(fp);
951 if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
952 obits |= (1 << j);
953 n++;
954 }
955 FILE_UNUSE(fp, l);
956 }
957 *obitp++ = obits;
958 }
959 }
960 *retval = n;
961 return (0);
962 }
963
964 /*
965 * Poll system call.
966 */
967 int
968 sys_poll(lwp_t *l, void *v, register_t *retval)
969 {
970 struct sys_poll_args /* {
971 syscallarg(struct pollfd *) fds;
972 syscallarg(u_int) nfds;
973 syscallarg(int) timeout;
974 } */ * const uap = v;
975 struct timeval atv, *tv = NULL;
976
977 if (SCARG(uap, timeout) != INFTIM) {
978 atv.tv_sec = SCARG(uap, timeout) / 1000;
979 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
980 tv = &atv;
981 }
982
983 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
984 tv, NULL);
985 }
986
987 /*
988 * Poll system call.
989 */
990 int
991 sys_pollts(lwp_t *l, void *v, register_t *retval)
992 {
993 struct sys_pollts_args /* {
994 syscallarg(struct pollfd *) fds;
995 syscallarg(u_int) nfds;
996 syscallarg(const struct timespec *) ts;
997 syscallarg(const sigset_t *) mask;
998 } */ * const uap = v;
999 struct timespec ats;
1000 struct timeval atv, *tv = NULL;
1001 sigset_t amask, *mask = NULL;
1002 int error;
1003
1004 if (SCARG(uap, ts)) {
1005 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
1006 if (error)
1007 return error;
1008 atv.tv_sec = ats.tv_sec;
1009 atv.tv_usec = ats.tv_nsec / 1000;
1010 tv = &atv;
1011 }
1012 if (SCARG(uap, mask)) {
1013 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
1014 if (error)
1015 return error;
1016 mask = &amask;
1017 }
1018
1019 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1020 tv, mask);
1021 }
1022
/*
 * Common code for poll()/pollts().
 *
 * The user's pollfd array is staged in "bits" (stack buffer for small
 * requests, kmem otherwise).  The retry loop cooperates with
 * selwakeup() through l_selflag and nselcoll exactly as selcommon()
 * does: if either changes while we were scanning, rescan rather than
 * block on select_cv.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char smallbits[32 * sizeof(struct pollfd)];
	proc_t * const p = l->l_proc;
	void * bits;
	sigset_t oldmask;
	int ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Temporarily install the caller-supplied signal mask. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Snapshot the collision counter and mark ourselves as
		 * scanning; selwakeup() changes one or the other if an
		 * event fires while select_lock is dropped.
		 */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* A wakeup or collision during the scan: scan again. */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		/* Restore the original signal mask. */
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1106
1107 int
1108 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1109 {
1110 proc_t *p = l->l_proc;
1111 struct filedesc *fdp;
1112 int i, n;
1113 struct file *fp;
1114
1115 fdp = p->p_fd;
1116 n = 0;
1117 for (i = 0; i < nfd; i++, fds++) {
1118 if (fds->fd >= fdp->fd_nfiles) {
1119 fds->revents = POLLNVAL;
1120 n++;
1121 } else if (fds->fd < 0) {
1122 fds->revents = 0;
1123 } else {
1124 if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1125 fds->revents = POLLNVAL;
1126 n++;
1127 } else {
1128 FILE_USE(fp);
1129 fds->revents = (*fp->f_ops->fo_poll)(fp,
1130 fds->events | POLLERR | POLLHUP, l);
1131 if (fds->revents != 0)
1132 n++;
1133 FILE_UNUSE(fp, l);
1134 }
1135 }
1136 }
1137 *retval = n;
1138 return (0);
1139 }
1140
1141 /*ARGSUSED*/
1142 int
1143 seltrue(dev_t dev, int events, lwp_t *l)
1144 {
1145
1146 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1147 }
1148
1149 /*
1150 * Record a select request.
1151 */
1152 void
1153 selrecord(lwp_t *selector, struct selinfo *sip)
1154 {
1155
1156 mutex_enter(&select_lock);
1157 if (sip->sel_lwp == NULL) {
1158 /* First named waiter, although there may be more. */
1159 sip->sel_lwp = selector;
1160 SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
1161 } else if (sip->sel_lwp != selector) {
1162 /* Multiple waiters. */
1163 sip->sel_collision = true;
1164 }
1165 mutex_exit(&select_lock);
1166 }
1167
/*
 * Do a wakeup when a selectable event occurs.
 *
 * Counterpart to selrecord(): either broadcast to all waiters (on a
 * recorded collision, also bumping nselcoll so in-flight scans retry)
 * or wake the single named waiter, resetting its l_selflag so it
 * rescans instead of blocking.
 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;	/* forces concurrent scanners to rescan */
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		/* SEL_RESET makes the waiter rescan rather than block. */
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}
1208
1209 void
1210 selnotify(struct selinfo *sip, long knhint)
1211 {
1212
1213 selwakeup(sip);
1214 KNOTE(&sip->sel_klist, knhint);
1215 }
1216
1217 /*
1218 * Remove an LWP from all objects that it is waiting for.
1219 */
1220 static void
1221 selclear(void)
1222 {
1223 struct selinfo *sip;
1224 lwp_t *l = curlwp;
1225
1226 KASSERT(mutex_owned(&select_lock));
1227
1228 SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1229 KASSERT(sip->sel_lwp == l);
1230 sip->sel_lwp = NULL;
1231 }
1232 }
1233
1234 /*
1235 * Initialize the select/poll system calls.
1236 */
1237 void
1238 selsysinit(void)
1239 {
1240
1241 mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
1242 cv_init(&select_cv, "select");
1243 }
1244