/*	$NetBSD: sys_generic.c,v 1.100.2.8 2007/07/15 13:27:45 ad Exp $	*/
2
3 /*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*-
40 * Copyright (c) 2007 The NetBSD Foundation, Inc.
41 * All rights reserved.
42 *
43 * This code is derived from software contributed to The NetBSD Foundation
44 * by Andrew Doran.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
48 * are met:
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the NetBSD
57 * Foundation, Inc. and its contributors.
58 * 4. Neither the name of The NetBSD Foundation nor the names of its
59 * contributors may be used to endorse or promote products derived
60 * from this software without specific prior written permission.
61 *
62 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
63 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
64 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
65 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
66 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72 * POSSIBILITY OF SUCH DAMAGE.
73 */
74
75 /*
76 * Copyright (c) 1982, 1986, 1989, 1993
77 * The Regents of the University of California. All rights reserved.
78 * (c) UNIX System Laboratories, Inc.
79 * All or some portions of this file are derived from material licensed
80 * to the University of California by American Telephone and Telegraph
81 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
82 * the permission of UNIX System Laboratories, Inc.
83 *
84 * Redistribution and use in source and binary forms, with or without
85 * modification, are permitted provided that the following conditions
86 * are met:
87 * 1. Redistributions of source code must retain the above copyright
88 * notice, this list of conditions and the following disclaimer.
89 * 2. Redistributions in binary form must reproduce the above copyright
90 * notice, this list of conditions and the following disclaimer in the
91 * documentation and/or other materials provided with the distribution.
92 * 3. Neither the name of the University nor the names of its contributors
93 * may be used to endorse or promote products derived from this software
94 * without specific prior written permission.
95 *
96 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
97 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
98 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
99 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
100 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
101 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
102 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
103 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
104 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
105 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
106 * SUCH DAMAGE.
107 *
108 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
109 */
110
/*
 * System calls relating to files.
 */
118
119 #include <sys/cdefs.h>
120 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.100.2.8 2007/07/15 13:27:45 ad Exp $");
121
122 #include "opt_ktrace.h"
123
124 #include <sys/param.h>
125 #include <sys/systm.h>
126 #include <sys/filedesc.h>
127 #include <sys/ioctl.h>
128 #include <sys/file.h>
129 #include <sys/proc.h>
130 #include <sys/socketvar.h>
131 #include <sys/signalvar.h>
132 #include <sys/uio.h>
133 #include <sys/kernel.h>
134 #include <sys/stat.h>
135 #include <sys/kmem.h>
136 #include <sys/poll.h>
137 #include <sys/mount.h>
138 #include <sys/syscallargs.h>
139 #ifdef KTRACE
140 #include <sys/ktrace.h>
141 #endif
142
143 #include <uvm/uvm_extern.h>
144
/*
 * Flags for lwp::l_selflag, the per-LWP select/poll state machine.
 * Transitions are coordinated via select_lock (see selcommon(),
 * pollcommon() and selwakeup() below).
 */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */

/* File-local helpers defined below. */
static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

/* Global state for select()/poll(). */
kmutex_t	select_lock;	/* serializes all select/poll bookkeeping */
kcondvar_t	select_cv;	/* waiters block here between scans */
int		nselcoll;	/* collision counter, bumped in selwakeup() */
158
/*
 * Read system call.
 *
 * Looks up the descriptor, verifies it is open for reading, and
 * hands off to dofileread() with FOF_UPDATE_OFFSET so the file
 * offset advances.
 */
/* ARGSUSED */
int
sys_read(lwp_t *l, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int)		fd;
		syscallarg(void *)	buf;
		syscallarg(size_t)	nbyte;
	} */ *uap = v;
	int		fd;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	/*
	 * The descriptor must be open for reading.
	 * NOTE(review): fd_getfile() appears to return with fp->f_lock
	 * held (the error path drops it, FILE_USE() presumably consumes
	 * it) -- confirm against kern_descrip.c.
	 */
	if ((fp->f_flag & FREAD) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
194
195 int
196 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
197 off_t *offset, int flags, register_t *retval)
198 {
199 struct iovec aiov;
200 struct uio auio;
201 proc_t *p;
202 struct vmspace *vm;
203 size_t cnt;
204 int error;
205 #ifdef KTRACE
206 struct iovec ktriov;
207 #endif
208 p = l->l_proc;
209
210 error = proc_vmspace_getref(p, &vm);
211 if (error) {
212 goto out;
213 }
214
215 aiov.iov_base = (void *)buf;
216 aiov.iov_len = nbyte;
217 auio.uio_iov = &aiov;
218 auio.uio_iovcnt = 1;
219 auio.uio_resid = nbyte;
220 auio.uio_rw = UIO_READ;
221 auio.uio_vmspace = vm;
222
223 /*
224 * Reads return ssize_t because -1 is returned on error. Therefore
225 * we must restrict the length to SSIZE_MAX to avoid garbage return
226 * values.
227 */
228 if (auio.uio_resid > SSIZE_MAX) {
229 error = EINVAL;
230 goto out;
231 }
232
233 #ifdef KTRACE
234 /* In case we are tracing, save a copy of iovec */
235 ktriov = aiov;
236 #endif
237 cnt = auio.uio_resid;
238 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
239 if (error)
240 if (auio.uio_resid != cnt && (error == ERESTART ||
241 error == EINTR || error == EWOULDBLOCK))
242 error = 0;
243 cnt -= auio.uio_resid;
244 #ifdef KTRACE
245 if (KTRPOINT(p, KTR_GENIO) && error == 0)
246 ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
247 #endif
248 *retval = cnt;
249 out:
250 FILE_UNUSE(fp, l);
251 uvmspace_free(vm);
252 return (error);
253 }
254
/*
 * Scatter read system call.
 *
 * Validates the descriptor for reading, then defers the iovec
 * handling to dofilereadv().
 */
int
sys_readv(lwp_t *l, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int)				fd;
		syscallarg(const struct iovec *)	iovp;
		syscallarg(int)				iovcnt;
	} */ *uap = v;
	struct filedesc	*fdp;
	struct file	*fp;
	proc_t		*p;
	int		fd;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	/* The descriptor must be open for reading; drop f_lock on error. */
	if ((fp->f_flag & FREAD) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilereadv() will unuse the descriptor for us */
	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
289
290 int
291 dofilereadv(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
292 int iovcnt, off_t *offset, int flags, register_t *retval)
293 {
294 proc_t *p;
295 struct uio auio;
296 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
297 struct vmspace *vm;
298 int i, error;
299 size_t cnt;
300 u_int iovlen;
301 #ifdef KTRACE
302 struct iovec *ktriov;
303 #endif
304
305 p = l->l_proc;
306 error = proc_vmspace_getref(p, &vm);
307 if (error) {
308 goto out;
309 }
310
311 #ifdef KTRACE
312 ktriov = NULL;
313 #endif
314 /* note: can't use iovlen until iovcnt is validated */
315 iovlen = iovcnt * sizeof(struct iovec);
316 if ((u_int)iovcnt > UIO_SMALLIOV) {
317 if ((u_int)iovcnt > IOV_MAX) {
318 error = EINVAL;
319 goto out;
320 }
321 iov = kmem_alloc(iovlen, KM_SLEEP);
322 needfree = iov;
323 } else if ((u_int)iovcnt > 0) {
324 iov = aiov;
325 needfree = NULL;
326 } else {
327 error = EINVAL;
328 goto out;
329 }
330
331 auio.uio_iov = iov;
332 auio.uio_iovcnt = iovcnt;
333 auio.uio_rw = UIO_READ;
334 auio.uio_vmspace = vm;
335 error = copyin(iovp, iov, iovlen);
336 if (error)
337 goto done;
338 auio.uio_resid = 0;
339 for (i = 0; i < iovcnt; i++) {
340 auio.uio_resid += iov->iov_len;
341 /*
342 * Reads return ssize_t because -1 is returned on error.
343 * Therefore we must restrict the length to SSIZE_MAX to
344 * avoid garbage return values.
345 */
346 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
347 error = EINVAL;
348 goto done;
349 }
350 iov++;
351 }
352 #ifdef KTRACE
353 /*
354 * if tracing, save a copy of iovec
355 */
356 if (KTRPOINT(p, KTR_GENIO)) {
357 ktriov = kmem_alloc(iovlen, KM_SLEEP);
358 memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
359 }
360 #endif
361 cnt = auio.uio_resid;
362 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
363 if (error)
364 if (auio.uio_resid != cnt && (error == ERESTART ||
365 error == EINTR || error == EWOULDBLOCK))
366 error = 0;
367 cnt -= auio.uio_resid;
368 #ifdef KTRACE
369 if (ktriov != NULL) {
370 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
371 ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
372 kmem_free(ktriov, iovlen);
373 }
374 #endif
375 *retval = cnt;
376 done:
377 if (needfree)
378 kmem_free(needfree, iovlen);
379 out:
380 FILE_UNUSE(fp, l);
381 uvmspace_free(vm);
382 return (error);
383 }
384
/*
 * Write system call.
 *
 * Looks up the descriptor, verifies it is open for writing, and
 * hands off to dofilewrite() with FOF_UPDATE_OFFSET so the file
 * offset advances.
 */
int
sys_write(lwp_t *l, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int)			fd;
		syscallarg(const void *)	buf;
		syscallarg(size_t)		nbyte;
	} */ *uap = v;
	int		fd;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	/* The descriptor must be open for writing; drop f_lock on error. */
	if ((fp->f_flag & FWRITE) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilewrite() will unuse the descriptor for us */
	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
419
420 int
421 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
422 size_t nbyte, off_t *offset, int flags, register_t *retval)
423 {
424 struct iovec aiov;
425 struct uio auio;
426 proc_t *p;
427 struct vmspace *vm;
428 size_t cnt;
429 int error;
430 #ifdef KTRACE
431 struct iovec ktriov;
432 #endif
433
434 p = l->l_proc;
435 error = proc_vmspace_getref(p, &vm);
436 if (error) {
437 goto out;
438 }
439 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
440 aiov.iov_len = nbyte;
441 auio.uio_iov = &aiov;
442 auio.uio_iovcnt = 1;
443 auio.uio_resid = nbyte;
444 auio.uio_rw = UIO_WRITE;
445 auio.uio_vmspace = vm;
446
447 /*
448 * Writes return ssize_t because -1 is returned on error. Therefore
449 * we must restrict the length to SSIZE_MAX to avoid garbage return
450 * values.
451 */
452 if (auio.uio_resid > SSIZE_MAX) {
453 error = EINVAL;
454 goto out;
455 }
456
457 #ifdef KTRACE
458 /* In case we are tracing, save a copy of iovec */
459 ktriov = aiov;
460 #endif
461 cnt = auio.uio_resid;
462 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
463 if (error) {
464 if (auio.uio_resid != cnt && (error == ERESTART ||
465 error == EINTR || error == EWOULDBLOCK))
466 error = 0;
467 if (error == EPIPE) {
468 mutex_enter(&proclist_mutex);
469 psignal(p, SIGPIPE);
470 mutex_exit(&proclist_mutex);
471 }
472 }
473 cnt -= auio.uio_resid;
474 #ifdef KTRACE
475 if (KTRPOINT(p, KTR_GENIO) && error == 0)
476 ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
477 #endif
478 *retval = cnt;
479 out:
480 FILE_UNUSE(fp, l);
481 uvmspace_free(vm);
482 return (error);
483 }
484
/*
 * Gather write system call.
 *
 * Validates the descriptor for writing, then defers the iovec
 * handling to dofilewritev().
 */
int
sys_writev(lwp_t *l, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int)				fd;
		syscallarg(const struct iovec *)	iovp;
		syscallarg(int)				iovcnt;
	} */ *uap = v;
	int		fd;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	/* The descriptor must be open for writing; drop f_lock on error. */
	if ((fp->f_flag & FWRITE) == 0) {
		mutex_exit(&fp->f_lock);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofilewritev() will unuse the descriptor for us */
	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
519
520 int
521 dofilewritev(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
522 int iovcnt, off_t *offset, int flags, register_t *retval)
523 {
524 proc_t *p;
525 struct uio auio;
526 struct iovec *iov, *needfree, aiov[UIO_SMALLIOV];
527 struct vmspace *vm;
528 int i, error;
529 size_t cnt;
530 u_int iovlen;
531 #ifdef KTRACE
532 struct iovec *ktriov;
533 #endif
534
535 p = l->l_proc;
536 error = proc_vmspace_getref(p, &vm);
537 if (error) {
538 goto out;
539 }
540 #ifdef KTRACE
541 ktriov = NULL;
542 #endif
543 /* note: can't use iovlen until iovcnt is validated */
544 iovlen = iovcnt * sizeof(struct iovec);
545 if ((u_int)iovcnt > UIO_SMALLIOV) {
546 if ((u_int)iovcnt > IOV_MAX) {
547 error = EINVAL;
548 goto out;
549 }
550 iov = kmem_alloc(iovlen, KM_SLEEP);
551 needfree = iov;
552 } else if ((u_int)iovcnt > 0) {
553 iov = aiov;
554 needfree = NULL;
555 } else {
556 error = EINVAL;
557 goto out;
558 }
559
560 auio.uio_iov = iov;
561 auio.uio_iovcnt = iovcnt;
562 auio.uio_rw = UIO_WRITE;
563 auio.uio_vmspace = vm;
564 error = copyin(iovp, iov, iovlen);
565 if (error)
566 goto done;
567 auio.uio_resid = 0;
568 for (i = 0; i < iovcnt; i++) {
569 auio.uio_resid += iov->iov_len;
570 /*
571 * Writes return ssize_t because -1 is returned on error.
572 * Therefore we must restrict the length to SSIZE_MAX to
573 * avoid garbage return values.
574 */
575 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
576 error = EINVAL;
577 goto done;
578 }
579 iov++;
580 }
581 #ifdef KTRACE
582 /*
583 * if tracing, save a copy of iovec
584 */
585 if (KTRPOINT(p, KTR_GENIO)) {
586 ktriov = kmem_alloc(iovlen, KM_SLEEP);
587 memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
588 }
589 #endif
590 cnt = auio.uio_resid;
591 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
592 if (error) {
593 if (auio.uio_resid != cnt && (error == ERESTART ||
594 error == EINTR || error == EWOULDBLOCK))
595 error = 0;
596 if (error == EPIPE) {
597 mutex_enter(&proclist_mutex);
598 psignal(p, SIGPIPE);
599 mutex_exit(&proclist_mutex);
600 }
601 }
602 cnt -= auio.uio_resid;
603 #ifdef KTRACE
604 if (ktriov != NULL) {
605 if (KTRPOINT(p, KTR_GENIO) && (error == 0))
606 ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
607 kmem_free(ktriov, iovlen);
608 }
609 #endif
610 *retval = cnt;
611 done:
612 if (needfree)
613 kmem_free(needfree, iovlen);
614 out:
615 FILE_UNUSE(fp, l);
616 uvmspace_free(vm);
617 return (error);
618 }
619
/*
 * Ioctl system call.
 *
 * Handles the close-on-exec pseudo-ioctls (FIONCLEX/FIOCLEX) and the
 * generic flag ioctls (FIONBIO/FIOASYNC) directly; everything else
 * is dispatched to the file's fo_ioctl routine.  The IOC_IN/IOC_OUT
 * bits of the command word describe how much argument data to copy
 * to/from user space.
 */
/* ARGSUSED */
int
sys_ioctl(lwp_t *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */ *uap = v;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void		*data, *memp;
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the error-report switch below quiet */
		goto out;
	}

	/*
	 * FIONCLEX/FIOCLEX manipulate the per-descriptor close-on-exec
	 * flag and never reach the file's own ioctl routine.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		/* Argument too big for the stack buffer: heap-allocate. */
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
					size, 0);
			}
#endif
		} else
			/* Zero-length IOC_IN: pass the pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep the per-file flag and the object in sync. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Keep the per-file flag and the object in sync. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
					size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/*
		 * -1 from an ioctl routine is a driver bug: log it and
		 * report ENOTTY to the caller instead.
		 */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
775
776 /*
777 * Select system call.
778 */
779 int
780 sys_pselect(lwp_t *l, void *v, register_t *retval)
781 {
782 struct sys_pselect_args /* {
783 syscallarg(int) nd;
784 syscallarg(fd_set *) in;
785 syscallarg(fd_set *) ou;
786 syscallarg(fd_set *) ex;
787 syscallarg(const struct timespec *) ts;
788 syscallarg(sigset_t *) mask;
789 } */ * const uap = v;
790 struct timespec ats;
791 struct timeval atv, *tv = NULL;
792 sigset_t amask, *mask = NULL;
793 int error;
794
795 if (SCARG(uap, ts)) {
796 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
797 if (error)
798 return error;
799 atv.tv_sec = ats.tv_sec;
800 atv.tv_usec = ats.tv_nsec / 1000;
801 tv = &atv;
802 }
803 if (SCARG(uap, mask) != NULL) {
804 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
805 if (error)
806 return error;
807 mask = &amask;
808 }
809
810 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
811 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
812 }
813
/*
 * Validate a select/poll timeout and record the current monotonic
 * time in *sleeptv as the reference point for later gettimeleft()
 * calls.  Returns 0 on success or -1 if the timeout is invalid.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
822
823 int
824 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
825 {
826 /*
827 * We have to recalculate the timeout on every retry.
828 */
829 struct timeval slepttv;
830 /*
831 * reduce tv by elapsed time
832 * based on monotonic time scale
833 */
834 getmicrouptime(&slepttv);
835 timeradd(tv, sleeptv, tv);
836 timersub(tv, &slepttv, tv);
837 *sleeptv = slepttv;
838 return tvtohz(tv);
839 }
840
841 int
842 sys_select(lwp_t *l, void *v, register_t *retval)
843 {
844 struct sys_select_args /* {
845 syscallarg(int) nd;
846 syscallarg(fd_set *) in;
847 syscallarg(fd_set *) ou;
848 syscallarg(fd_set *) ex;
849 syscallarg(struct timeval *) tv;
850 } */ * const uap = v;
851 struct timeval atv, *tv = NULL;
852 int error;
853
854 if (SCARG(uap, tv)) {
855 error = copyin(SCARG(uap, tv), (void *)&atv,
856 sizeof(atv));
857 if (error)
858 return error;
859 tv = &atv;
860 }
861
862 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
863 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
864 }
865
/*
 * Common code shared by select(2) and pselect(2).
 *
 * Copies in up to three descriptor sets, scans them repeatedly
 * (sleeping on select_cv between scans) until an event is found, the
 * timeout expires, or a signal arrives, then copies the result sets
 * back out.  If mask is non-NULL it is installed as the LWP's signal
 * mask for the duration of the call.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	/* Room for three input bit sets plus three output bit sets. */
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval	sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; a NULL set is treated as empty. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller-supplied signal mask. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with select_lock dropped.  If l_selflag or
		 * nselcoll changed while we were scanning, an event may
		 * have been missed: rescan rather than sleep.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Output sets occupy the second group of three buffers. */
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
966
967 int
968 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
969 register_t *retval)
970 {
971 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
972 POLLWRNORM | POLLHUP | POLLERR,
973 POLLRDBAND };
974 proc_t *p = l->l_proc;
975 struct filedesc *fdp;
976 int msk, i, j, fd, n;
977 fd_mask ibits, obits;
978 struct file *fp;
979
980 fdp = p->p_fd;
981 n = 0;
982 for (msk = 0; msk < 3; msk++) {
983 for (i = 0; i < nfd; i += NFDBITS) {
984 ibits = *ibitp++;
985 obits = 0;
986 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
987 ibits &= ~(1 << j);
988 if ((fp = fd_getfile(fdp, fd)) == NULL)
989 return (EBADF);
990 FILE_USE(fp);
991 if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
992 obits |= (1 << j);
993 n++;
994 }
995 FILE_UNUSE(fp, l);
996 }
997 *obitp++ = obits;
998 }
999 }
1000 *retval = n;
1001 return (0);
1002 }
1003
1004 /*
1005 * Poll system call.
1006 */
1007 int
1008 sys_poll(lwp_t *l, void *v, register_t *retval)
1009 {
1010 struct sys_poll_args /* {
1011 syscallarg(struct pollfd *) fds;
1012 syscallarg(u_int) nfds;
1013 syscallarg(int) timeout;
1014 } */ * const uap = v;
1015 struct timeval atv, *tv = NULL;
1016
1017 if (SCARG(uap, timeout) != INFTIM) {
1018 atv.tv_sec = SCARG(uap, timeout) / 1000;
1019 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
1020 tv = &atv;
1021 }
1022
1023 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1024 tv, NULL);
1025 }
1026
1027 /*
1028 * Poll system call.
1029 */
1030 int
1031 sys_pollts(lwp_t *l, void *v, register_t *retval)
1032 {
1033 struct sys_pollts_args /* {
1034 syscallarg(struct pollfd *) fds;
1035 syscallarg(u_int) nfds;
1036 syscallarg(const struct timespec *) ts;
1037 syscallarg(const sigset_t *) mask;
1038 } */ * const uap = v;
1039 struct timespec ats;
1040 struct timeval atv, *tv = NULL;
1041 sigset_t amask, *mask = NULL;
1042 int error;
1043
1044 if (SCARG(uap, ts)) {
1045 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
1046 if (error)
1047 return error;
1048 atv.tv_sec = ats.tv_sec;
1049 atv.tv_usec = ats.tv_nsec / 1000;
1050 tv = &atv;
1051 }
1052 if (SCARG(uap, mask)) {
1053 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
1054 if (error)
1055 return error;
1056 mask = &amask;
1057 }
1058
1059 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1060 tv, mask);
1061 }
1062
/*
 * Common code shared by poll(2) and pollts(2).
 *
 * Copies in the pollfd array, scans it repeatedly (sleeping on
 * select_cv between scans) until an event is found, the timeout
 * expires, or a signal arrives, then copies the array (with revents
 * filled in) back out.  If mask is non-NULL it is installed as the
 * LWP's signal mask for the duration of the call.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller-supplied signal mask. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with select_lock dropped.  If l_selflag or
		 * nselcoll changed while we were scanning, an event may
		 * have been missed: rescan rather than sleep.
		 */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1146
1147 int
1148 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1149 {
1150 proc_t *p = l->l_proc;
1151 struct filedesc *fdp;
1152 int i, n;
1153 struct file *fp;
1154
1155 fdp = p->p_fd;
1156 n = 0;
1157 for (i = 0; i < nfd; i++, fds++) {
1158 if (fds->fd >= fdp->fd_nfiles) {
1159 fds->revents = POLLNVAL;
1160 n++;
1161 } else if (fds->fd < 0) {
1162 fds->revents = 0;
1163 } else {
1164 if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1165 fds->revents = POLLNVAL;
1166 n++;
1167 } else {
1168 FILE_USE(fp);
1169 fds->revents = (*fp->f_ops->fo_poll)(fp,
1170 fds->events | POLLERR | POLLHUP, l);
1171 if (fds->revents != 0)
1172 n++;
1173 FILE_UNUSE(fp, l);
1174 }
1175 }
1176 }
1177 *retval = n;
1178 return (0);
1179 }
1180
1181 /*ARGSUSED*/
1182 int
1183 seltrue(dev_t dev, int events, lwp_t *l)
1184 {
1185
1186 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1187 }
1188
1189 /*
1190 * Record a select request.
1191 */
1192 void
1193 selrecord(lwp_t *selector, struct selinfo *sip)
1194 {
1195
1196 mutex_enter(&select_lock);
1197 if (sip->sel_lwp == NULL) {
1198 /* First named waiter, although there may be more. */
1199 sip->sel_lwp = selector;
1200 SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
1201 } else if (sip->sel_lwp != selector) {
1202 /* Multiple waiters. */
1203 sip->sel_collision = true;
1204 }
1205 mutex_exit(&select_lock);
1206 }
1207
/*
 * Do a wakeup when a selectable event occurs.
 *
 * With multiple waiters recorded (sel_collision) we cannot tell which
 * LWPs need the wakeup, so bump the collision counter and broadcast.
 * With a single named waiter, wake only that LWP, and reset its
 * l_selflag so a concurrent selcommon()/pollcommon() loop rescans
 * instead of going to sleep.
 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}
1248
/*
 * Notify both select/poll waiters and any attached knotes that an
 * event has occurred on sip.  knhint is passed through as the hint
 * to the knote filters.
 */
void
selnotify(struct selinfo *sip, long knhint)
{

	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}
1256
/*
 * Remove an LWP from all objects that it is waiting for.
 *
 * Called with select_lock held.  Note that only the sel_lwp
 * back-pointers are cleared; the l_selwait list itself is not
 * unlinked here — selcommon()/pollcommon() reinitialize it with
 * SLIST_INIT() before each use.
 */
static void
selclear(void)
{
	struct selinfo *sip;
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(&select_lock));

	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
		/* Every record on our wait list must name us. */
		KASSERT(sip->sel_lwp == l);
		sip->sel_lwp = NULL;
	}
}
1273
/*
 * Initialize the select/poll system calls.  Sets up the global lock
 * and condition variable shared by all select/poll waiters; must run
 * once before the first select/poll call.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
	cv_init(&select_cv, "select");
}
1284