/*	$NetBSD: sys_generic.c,v 1.100.2.11 2007/08/30 13:10:37 ad Exp $	*/
2
3 /*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1989, 1993
41 * The Regents of the University of California. All rights reserved.
42 * (c) UNIX System Laboratories, Inc.
43 * All or some portions of this file are derived from material licensed
44 * to the University of California by American Telephone and Telegraph
45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46 * the permission of UNIX System Laboratories, Inc.
47 *
48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions
50 * are met:
51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution.
56 * 3. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
73 */
74
75 /*
76 * System calls relating to files.
77 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.100.2.11 2007/08/30 13:10:37 ad Exp $");
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99
100 #include <uvm/uvm_extern.h>
101
/*
 * Per-LWP select()/poll() state machine, kept in lwp::l_selflag and
 * protected by select_lock.
 */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

/* Global state for select()/poll(). */
kmutex_t	select_lock;	/* guards l_selwait lists and selinfo state */
kcondvar_t	select_cv;	/* all waiters block here (single global CV) */
int		nselcoll;	/* collision generation count; bump on wakeup */
115
116 /*
117 * Read system call.
118 */
119 /* ARGSUSED */
120 int
121 sys_read(lwp_t *l, void *v, register_t *retval)
122 {
123 struct sys_read_args /* {
124 syscallarg(int) fd;
125 syscallarg(void *) buf;
126 syscallarg(size_t) nbyte;
127 } */ *uap = v;
128 int fd;
129 struct file *fp;
130 proc_t *p;
131 struct filedesc *fdp;
132
133 fd = SCARG(uap, fd);
134 p = l->l_proc;
135 fdp = p->p_fd;
136
137 if ((fp = fd_getfile(fdp, fd)) == NULL)
138 return (EBADF);
139
140 if ((fp->f_flag & FREAD) == 0) {
141 mutex_exit(&fp->f_lock);
142 return (EBADF);
143 }
144
145 FILE_USE(fp);
146
147 /* dofileread() will unuse the descriptor for us */
148 return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
149 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
150 }
151
152 int
153 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
154 off_t *offset, int flags, register_t *retval)
155 {
156 struct iovec aiov;
157 struct uio auio;
158 proc_t *p;
159 struct vmspace *vm;
160 size_t cnt;
161 int error;
162 p = l->l_proc;
163
164 error = proc_vmspace_getref(p, &vm);
165 if (error) {
166 goto out;
167 }
168
169 aiov.iov_base = (void *)buf;
170 aiov.iov_len = nbyte;
171 auio.uio_iov = &aiov;
172 auio.uio_iovcnt = 1;
173 auio.uio_resid = nbyte;
174 auio.uio_rw = UIO_READ;
175 auio.uio_vmspace = vm;
176
177 /*
178 * Reads return ssize_t because -1 is returned on error. Therefore
179 * we must restrict the length to SSIZE_MAX to avoid garbage return
180 * values.
181 */
182 if (auio.uio_resid > SSIZE_MAX) {
183 error = EINVAL;
184 goto out;
185 }
186
187 cnt = auio.uio_resid;
188 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
189 if (error)
190 if (auio.uio_resid != cnt && (error == ERESTART ||
191 error == EINTR || error == EWOULDBLOCK))
192 error = 0;
193 cnt -= auio.uio_resid;
194 ktrgenio(fd, UIO_READ, buf, nbyte, error);
195 *retval = cnt;
196 out:
197 FILE_UNUSE(fp, l);
198 uvmspace_free(vm);
199 return (error);
200 }
201
202 /*
203 * Scatter read system call.
204 */
205 int
206 sys_readv(lwp_t *l, void *v, register_t *retval)
207 {
208 struct sys_readv_args /* {
209 syscallarg(int) fd;
210 syscallarg(const struct iovec *) iovp;
211 syscallarg(int) iovcnt;
212 } */ *uap = v;
213
214 return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
215 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
216 }
217
218 int
219 do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
220 off_t *offset, int flags, register_t *retval)
221 {
222 struct proc *p;
223 struct uio auio;
224 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
225 struct vmspace *vm;
226 int i, error;
227 size_t cnt;
228 u_int iovlen;
229 struct file *fp;
230 struct filedesc *fdp;
231 struct iovec *ktriov = NULL;
232
233 if (iovcnt == 0)
234 return EINVAL;
235
236 p = l->l_proc;
237 fdp = p->p_fd;
238
239 if ((fp = fd_getfile(fdp, fd)) == NULL)
240 return EBADF;
241
242 if ((fp->f_flag & FREAD) == 0) {
243 mutex_exit(&fp->f_lock);
244 return EBADF;
245 }
246
247 FILE_USE(fp);
248
249 if (offset == NULL)
250 offset = &fp->f_offset;
251 else {
252 struct vnode *vp = fp->f_data;
253 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
254 error = ESPIPE;
255 goto out;
256 }
257 /*
258 * Test that the device is seekable ?
259 * XXX This works because no file systems actually
260 * XXX take any action on the seek operation.
261 */
262 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
263 if (error != 0)
264 goto out;
265 }
266
267 error = proc_vmspace_getref(p, &vm);
268 if (error)
269 goto out;
270
271 iovlen = iovcnt * sizeof(struct iovec);
272 if (flags & FOF_IOV_SYSSPACE)
273 iov = __UNCONST(iovp);
274 else {
275 iov = aiov;
276 if ((u_int)iovcnt > UIO_SMALLIOV) {
277 if ((u_int)iovcnt > IOV_MAX) {
278 error = EINVAL;
279 goto out;
280 }
281 iov = kmem_alloc(iovlen, KM_SLEEP);
282 if (iov == NULL) {
283 error = ENOMEM;
284 goto out;
285 }
286 needfree = iov;
287 }
288 error = copyin(iovp, iov, iovlen);
289 if (error)
290 goto done;
291 }
292
293 auio.uio_iov = iov;
294 auio.uio_iovcnt = iovcnt;
295 auio.uio_rw = UIO_READ;
296 auio.uio_vmspace = vm;
297
298 auio.uio_resid = 0;
299 for (i = 0; i < iovcnt; i++, iov++) {
300 auio.uio_resid += iov->iov_len;
301 /*
302 * Reads return ssize_t because -1 is returned on error.
303 * Therefore we must restrict the length to SSIZE_MAX to
304 * avoid garbage return values.
305 */
306 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
307 error = EINVAL;
308 goto done;
309 }
310 }
311
312 /*
313 * if tracing, save a copy of iovec
314 */
315 if (ktrpoint(KTR_GENIO)) {
316 ktriov = kmem_alloc(iovlen, KM_SLEEP);
317 if (ktriov != NULL)
318 memcpy(ktriov, auio.uio_iov, iovlen);
319 }
320
321 cnt = auio.uio_resid;
322 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
323 if (error)
324 if (auio.uio_resid != cnt && (error == ERESTART ||
325 error == EINTR || error == EWOULDBLOCK))
326 error = 0;
327 cnt -= auio.uio_resid;
328 *retval = cnt;
329
330 if (ktriov != NULL) {
331 ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
332 kmem_free(ktriov, iovlen);
333 }
334
335 done:
336 if (needfree)
337 kmem_free(needfree, iovlen);
338 out:
339 FILE_UNUSE(fp, l);
340 uvmspace_free(vm);
341 return (error);
342 }
343
344 /*
345 * Write system call
346 */
347 int
348 sys_write(lwp_t *l, void *v, register_t *retval)
349 {
350 struct sys_write_args /* {
351 syscallarg(int) fd;
352 syscallarg(const void *) buf;
353 syscallarg(size_t) nbyte;
354 } */ *uap = v;
355 int fd;
356 struct file *fp;
357 proc_t *p;
358 struct filedesc *fdp;
359
360 fd = SCARG(uap, fd);
361 p = l->l_proc;
362 fdp = p->p_fd;
363
364 if ((fp = fd_getfile(fdp, fd)) == NULL)
365 return (EBADF);
366
367 if ((fp->f_flag & FWRITE) == 0) {
368 mutex_exit(&fp->f_lock);
369 return (EBADF);
370 }
371
372 FILE_USE(fp);
373
374 /* dofilewrite() will unuse the descriptor for us */
375 return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
376 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
377 }
378
379 int
380 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
381 size_t nbyte, off_t *offset, int flags, register_t *retval)
382 {
383 struct iovec aiov;
384 struct uio auio;
385 proc_t *p;
386 struct vmspace *vm;
387 size_t cnt;
388 int error;
389
390 p = l->l_proc;
391 error = proc_vmspace_getref(p, &vm);
392 if (error) {
393 goto out;
394 }
395 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
396 aiov.iov_len = nbyte;
397 auio.uio_iov = &aiov;
398 auio.uio_iovcnt = 1;
399 auio.uio_resid = nbyte;
400 auio.uio_rw = UIO_WRITE;
401 auio.uio_vmspace = vm;
402
403 /*
404 * Writes return ssize_t because -1 is returned on error. Therefore
405 * we must restrict the length to SSIZE_MAX to avoid garbage return
406 * values.
407 */
408 if (auio.uio_resid > SSIZE_MAX) {
409 error = EINVAL;
410 goto out;
411 }
412
413 cnt = auio.uio_resid;
414 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
415 if (error) {
416 if (auio.uio_resid != cnt && (error == ERESTART ||
417 error == EINTR || error == EWOULDBLOCK))
418 error = 0;
419 if (error == EPIPE) {
420 mutex_enter(&proclist_mutex);
421 psignal(p, SIGPIPE);
422 mutex_exit(&proclist_mutex);
423 }
424 }
425 cnt -= auio.uio_resid;
426 ktrgenio(fd, UIO_WRITE, buf, nbyte, error);
427 *retval = cnt;
428 out:
429 FILE_UNUSE(fp, l);
430 uvmspace_free(vm);
431 return (error);
432 }
433
434 /*
435 * Gather write system call
436 */
437 int
438 sys_writev(lwp_t *l, void *v, register_t *retval)
439 {
440 struct sys_writev_args /* {
441 syscallarg(int) fd;
442 syscallarg(const struct iovec *) iovp;
443 syscallarg(int) iovcnt;
444 } */ *uap = v;
445
446 return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
447 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
448 }
449
450 int
451 do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
452 off_t *offset, int flags, register_t *retval)
453 {
454 struct proc *p;
455 struct uio auio;
456 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
457 struct vmspace *vm;
458 int i, error;
459 size_t cnt;
460 u_int iovlen;
461 struct file *fp;
462 struct filedesc *fdp;
463 struct iovec *ktriov = NULL;
464
465 if (iovcnt == 0)
466 return EINVAL;
467
468 p = l->l_proc;
469 fdp = p->p_fd;
470
471 if ((fp = fd_getfile(fdp, fd)) == NULL)
472 return EBADF;
473
474 if ((fp->f_flag & FWRITE) == 0) {
475 mutex_exit(&fp->f_lock);
476 return EBADF;
477 }
478
479 FILE_USE(fp);
480
481 if (offset == NULL)
482 offset = &fp->f_offset;
483 else {
484 struct vnode *vp = fp->f_data;
485 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
486 error = ESPIPE;
487 goto out;
488 }
489 /*
490 * Test that the device is seekable ?
491 * XXX This works because no file systems actually
492 * XXX take any action on the seek operation.
493 */
494 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
495 if (error != 0)
496 goto out;
497 }
498
499 error = proc_vmspace_getref(p, &vm);
500 if (error)
501 goto out;
502
503 iovlen = iovcnt * sizeof(struct iovec);
504 if (flags & FOF_IOV_SYSSPACE)
505 iov = __UNCONST(iovp);
506 else {
507 iov = aiov;
508 if ((u_int)iovcnt > UIO_SMALLIOV) {
509 if ((u_int)iovcnt > IOV_MAX) {
510 error = EINVAL;
511 goto out;
512 }
513 iov = kmem_alloc(iovlen, KM_SLEEP);
514 if (iov == NULL) {
515 error = ENOMEM;
516 goto out;
517 }
518 needfree = iov;
519 }
520 error = copyin(iovp, iov, iovlen);
521 if (error)
522 goto done;
523 }
524
525 auio.uio_iov = iov;
526 auio.uio_iovcnt = iovcnt;
527 auio.uio_rw = UIO_WRITE;
528 auio.uio_vmspace = vm;
529
530 auio.uio_resid = 0;
531 for (i = 0; i < iovcnt; i++, iov++) {
532 auio.uio_resid += iov->iov_len;
533 /*
534 * Writes return ssize_t because -1 is returned on error.
535 * Therefore we must restrict the length to SSIZE_MAX to
536 * avoid garbage return values.
537 */
538 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
539 error = EINVAL;
540 goto done;
541 }
542 }
543
544 /*
545 * if tracing, save a copy of iovec
546 */
547 if (ktrpoint(KTR_GENIO)) {
548 ktriov = kmem_alloc(iovlen, KM_SLEEP);
549 if (ktriov != NULL)
550 memcpy(ktriov, auio.uio_iov, iovlen);
551 }
552
553 cnt = auio.uio_resid;
554 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
555 if (error) {
556 if (auio.uio_resid != cnt && (error == ERESTART ||
557 error == EINTR || error == EWOULDBLOCK))
558 error = 0;
559 if (error == EPIPE) {
560 mutex_enter(&proclist_mutex);
561 psignal(p, SIGPIPE);
562 mutex_exit(&proclist_mutex);
563 }
564 }
565 cnt -= auio.uio_resid;
566 *retval = cnt;
567
568 if (ktriov != NULL) {
569 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
570 kmem_free(ktriov, iovlen);
571 }
572
573 done:
574 if (needfree)
575 kmem_free(needfree, iovlen);
576 out:
577 FILE_UNUSE(fp, l);
578 uvmspace_free(vm);
579 return (error);
580 }
581
582 /*
583 * Ioctl system call
584 */
585 /* ARGSUSED */
int
sys_ioctl(lwp_t *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
#define STK_PARAMS 128
	/* Small argument buffers are staged on the stack, not allocated. */
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	/* fd_getfile() returns with fp->f_lock held. */
	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the diagnostic switch at "out" sane */
		goto out;
	}

	/*
	 * FIONCLEX/FIOCLEX manipulate the close-on-exec flag on the
	 * descriptor table itself and never reach the file's fo_ioctl.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			/* Command takes input: copy it in from userland. */
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-size IOC_IN: pass the raw pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Mirror the non-blocking flag into f_flag, then notify. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Mirror the async-I/O flag into f_flag, then notify. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/* Driver bug: -1 is not a valid errno; report and map. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		/* No layer handled the command: the fd is not a tty for it. */
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
723
724 /*
725 * Select system call.
726 */
727 int
728 sys_pselect(lwp_t *l, void *v, register_t *retval)
729 {
730 struct sys_pselect_args /* {
731 syscallarg(int) nd;
732 syscallarg(fd_set *) in;
733 syscallarg(fd_set *) ou;
734 syscallarg(fd_set *) ex;
735 syscallarg(const struct timespec *) ts;
736 syscallarg(sigset_t *) mask;
737 } */ * const uap = v;
738 struct timespec ats;
739 struct timeval atv, *tv = NULL;
740 sigset_t amask, *mask = NULL;
741 int error;
742
743 if (SCARG(uap, ts)) {
744 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
745 if (error)
746 return error;
747 atv.tv_sec = ats.tv_sec;
748 atv.tv_usec = ats.tv_nsec / 1000;
749 tv = &atv;
750 }
751 if (SCARG(uap, mask) != NULL) {
752 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
753 if (error)
754 return error;
755 mask = &amask;
756 }
757
758 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
759 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
760 }
761
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	/* Reject out-of-range timeouts, then record the start time. */
	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
770
771 int
772 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
773 {
774 /*
775 * We have to recalculate the timeout on every retry.
776 */
777 struct timeval slepttv;
778 /*
779 * reduce tv by elapsed time
780 * based on monotonic time scale
781 */
782 getmicrouptime(&slepttv);
783 timeradd(tv, sleeptv, tv);
784 timersub(tv, &slepttv, tv);
785 *sleeptv = slepttv;
786 return tvtohz(tv);
787 }
788
789 int
790 sys_select(lwp_t *l, void *v, register_t *retval)
791 {
792 struct sys_select_args /* {
793 syscallarg(int) nd;
794 syscallarg(fd_set *) in;
795 syscallarg(fd_set *) ou;
796 syscallarg(fd_set *) ex;
797 syscallarg(struct timeval *) tv;
798 } */ * const uap = v;
799 struct timeval atv, *tv = NULL;
800 int error;
801
802 if (SCARG(uap, tv)) {
803 error = copyin(SCARG(uap, tv), (void *)&atv,
804 sizeof(atv));
805 if (error)
806 return error;
807 tv = &atv;
808 }
809
810 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
811 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
812 }
813
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	/* Room for three input bit sets followed by three output sets. */
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	proc_t * const p = l->l_proc;
	char *bits;
	int ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in (or zero) the in/ou/ex input sets at slots 0..2. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* pselect(): install the temporary signal mask for the wait. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with select_lock dropped; selrecord() will link
		 * this LWP to each selinfo it touches.  l_selflag and
		 * the collision count detect events racing the scan.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If a wakeup or a collision occurred while scanning,
		 * rescan immediately rather than blocking.
		 */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* pselect(): restore the caller's original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Output sets live at slots 3..5 of the bits buffer. */
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
914
915 int
916 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
917 register_t *retval)
918 {
919 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
920 POLLWRNORM | POLLHUP | POLLERR,
921 POLLRDBAND };
922 proc_t *p = l->l_proc;
923 struct filedesc *fdp;
924 int msk, i, j, fd, n;
925 fd_mask ibits, obits;
926 struct file *fp;
927
928 fdp = p->p_fd;
929 n = 0;
930 for (msk = 0; msk < 3; msk++) {
931 for (i = 0; i < nfd; i += NFDBITS) {
932 ibits = *ibitp++;
933 obits = 0;
934 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
935 ibits &= ~(1 << j);
936 if ((fp = fd_getfile(fdp, fd)) == NULL)
937 return (EBADF);
938 FILE_USE(fp);
939 if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
940 obits |= (1 << j);
941 n++;
942 }
943 FILE_UNUSE(fp, l);
944 }
945 *obitp++ = obits;
946 }
947 }
948 *retval = n;
949 return (0);
950 }
951
952 /*
953 * Poll system call.
954 */
955 int
956 sys_poll(lwp_t *l, void *v, register_t *retval)
957 {
958 struct sys_poll_args /* {
959 syscallarg(struct pollfd *) fds;
960 syscallarg(u_int) nfds;
961 syscallarg(int) timeout;
962 } */ * const uap = v;
963 struct timeval atv, *tv = NULL;
964
965 if (SCARG(uap, timeout) != INFTIM) {
966 atv.tv_sec = SCARG(uap, timeout) / 1000;
967 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
968 tv = &atv;
969 }
970
971 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
972 tv, NULL);
973 }
974
975 /*
976 * Poll system call.
977 */
978 int
979 sys_pollts(lwp_t *l, void *v, register_t *retval)
980 {
981 struct sys_pollts_args /* {
982 syscallarg(struct pollfd *) fds;
983 syscallarg(u_int) nfds;
984 syscallarg(const struct timespec *) ts;
985 syscallarg(const sigset_t *) mask;
986 } */ * const uap = v;
987 struct timespec ats;
988 struct timeval atv, *tv = NULL;
989 sigset_t amask, *mask = NULL;
990 int error;
991
992 if (SCARG(uap, ts)) {
993 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
994 if (error)
995 return error;
996 atv.tv_sec = ats.tv_sec;
997 atv.tv_usec = ats.tv_nsec / 1000;
998 tv = &atv;
999 }
1000 if (SCARG(uap, mask)) {
1001 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
1002 if (error)
1003 return error;
1004 mask = &amask;
1005 }
1006
1007 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1008 tv, mask);
1009 }
1010
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	/* Small descriptor arrays are staged on the stack. */
	char smallbits[32 * sizeof(struct pollfd)];
	proc_t * const p = l->l_proc;
	void * bits;
	sigset_t oldmask;
	int ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* pollts(): install the temporary signal mask for the wait. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with select_lock dropped; selrecord() links this
		 * LWP to each selinfo it touches.  l_selflag and the
		 * collision count detect events racing the scan.
		 */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If a wakeup or a collision occurred while scanning,
		 * rescan immediately rather than blocking.
		 */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* pollts(): restore the caller's original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the revents back to the user's pollfd array. */
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1094
1095 int
1096 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1097 {
1098 proc_t *p = l->l_proc;
1099 struct filedesc *fdp;
1100 int i, n;
1101 struct file *fp;
1102
1103 fdp = p->p_fd;
1104 n = 0;
1105 for (i = 0; i < nfd; i++, fds++) {
1106 if (fds->fd >= fdp->fd_nfiles) {
1107 fds->revents = POLLNVAL;
1108 n++;
1109 } else if (fds->fd < 0) {
1110 fds->revents = 0;
1111 } else {
1112 if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
1113 fds->revents = POLLNVAL;
1114 n++;
1115 } else {
1116 FILE_USE(fp);
1117 fds->revents = (*fp->f_ops->fo_poll)(fp,
1118 fds->events | POLLERR | POLLHUP, l);
1119 if (fds->revents != 0)
1120 n++;
1121 FILE_UNUSE(fp, l);
1122 }
1123 }
1124 }
1125 *retval = n;
1126 return (0);
1127 }
1128
1129 /*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	/* Generic "always ready" poll routine for simple devices. */
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
1136
1137 /*
1138 * Record a select request.
1139 */
1140 void
1141 selrecord(lwp_t *selector, struct selinfo *sip)
1142 {
1143
1144 mutex_enter(&select_lock);
1145 if (sip->sel_lwp == NULL) {
1146 /* First named waiter, although there may be more. */
1147 sip->sel_lwp = selector;
1148 SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
1149 } else if (sip->sel_lwp != selector) {
1150 /* Multiple waiters. */
1151 sip->sel_collision = true;
1152 }
1153 mutex_exit(&select_lock);
1154 }
1155
1156 /*
1157 * Do a wakeup when a selectable event occurs.
1158 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		/* Force the waiter out of SEL_SCANNING/SEL_BLOCKING. */
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}
1196
void
selnotify(struct selinfo *sip, long knhint)
{

	/* Wake select()/poll() waiters, then deliver the kevent hint. */
	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}
1204
1205 /*
1206 * Remove an LWP from all objects that it is waiting for.
1207 */
1208 static void
1209 selclear(void)
1210 {
1211 struct selinfo *sip;
1212 lwp_t *l = curlwp;
1213
1214 KASSERT(mutex_owned(&select_lock));
1215
1216 SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1217 KASSERT(sip->sel_lwp == l);
1218 sip->sel_lwp = NULL;
1219 }
1220 }
1221
1222 /*
1223 * Initialize the select/poll system calls.
1224 */
void
selsysinit(void)
{

	/* Set up the single global lock and condvar used by select/poll. */
	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
	cv_init(&select_cv, "select");
}
1232
1233 /*
1234 * Initialize a selector.
1235 */
void
selinit(struct selinfo *sip)
{

	/* Zero everything: no waiter, no collision, empty klist. */
	memset(sip, 0, sizeof(*sip));
}
1242
1243 /*
1244 * Destroy a selector.
1245 */
1246 void
1247 seldestroy(struct selinfo *sip)
1248 {
1249 lwp_t *l;
1250
1251 if (sip->sel_lwp == NULL)
1252 return;
1253
1254 mutex_enter(&select_lock);
1255 if ((l = sip->sel_lwp) != NULL) {
1256 /* This should rarely happen, so SLIST_REMOVE() is OK. */
1257 SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
1258 sip->sel_lwp = NULL;
1259 }
1260 mutex_exit(&select_lock);
1261 }
1262