sys_generic.c revision 1.107 1 /* $NetBSD: sys_generic.c,v 1.107 2007/09/25 13:53:11 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1989, 1993
41 * The Regents of the University of California. All rights reserved.
42 * (c) UNIX System Laboratories, Inc.
43 * All or some portions of this file are derived from material licensed
44 * to the University of California by American Telephone and Telegraph
45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46 * the permission of UNIX System Laboratories, Inc.
47 *
48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions
50 * are met:
51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution.
56 * 3. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
73 */
74
75 /*
76 * System calls relating to files.
77 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.107 2007/09/25 13:53:11 ad Exp $");
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99
100 #include <uvm/uvm_extern.h>
101
102 /* Flags for lwp::l_selflag. */
103 #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
104 #define SEL_SCANNING 1 /* polling descriptors */
105 #define SEL_BLOCKING 2 /* about to block on select_cv */
106
107 static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
108 static int pollscan(lwp_t *, struct pollfd *, int, register_t *);
109 static void selclear(void);
110
111 /* Global state for select()/poll(). */
112 kmutex_t select_lock;
113 kcondvar_t select_cv;
114 int nselcoll;
115
116 /*
117 * Read system call.
118 */
119 /* ARGSUSED */
120 int
121 sys_read(lwp_t *l, void *v, register_t *retval)
122 {
123 struct sys_read_args /* {
124 syscallarg(int) fd;
125 syscallarg(void *) buf;
126 syscallarg(size_t) nbyte;
127 } */ *uap = v;
128 int fd;
129 struct file *fp;
130 proc_t *p;
131 struct filedesc *fdp;
132
133 fd = SCARG(uap, fd);
134 p = l->l_proc;
135 fdp = p->p_fd;
136
137 if ((fp = fd_getfile(fdp, fd)) == NULL)
138 return (EBADF);
139
140 if ((fp->f_flag & FREAD) == 0) {
141 simple_unlock(&fp->f_slock);
142 return (EBADF);
143 }
144
145 FILE_USE(fp);
146
147 /* dofileread() will unuse the descriptor for us */
148 return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
149 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
150 }
151
152 int
153 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
154 off_t *offset, int flags, register_t *retval)
155 {
156 struct iovec aiov;
157 struct uio auio;
158 proc_t *p;
159 struct vmspace *vm;
160 size_t cnt;
161 int error;
162 p = l->l_proc;
163
164 error = proc_vmspace_getref(p, &vm);
165 if (error) {
166 FILE_UNUSE(fp, l);
167 return error;
168 }
169
170 aiov.iov_base = (void *)buf;
171 aiov.iov_len = nbyte;
172 auio.uio_iov = &aiov;
173 auio.uio_iovcnt = 1;
174 auio.uio_resid = nbyte;
175 auio.uio_rw = UIO_READ;
176 auio.uio_vmspace = vm;
177
178 /*
179 * Reads return ssize_t because -1 is returned on error. Therefore
180 * we must restrict the length to SSIZE_MAX to avoid garbage return
181 * values.
182 */
183 if (auio.uio_resid > SSIZE_MAX) {
184 error = EINVAL;
185 goto out;
186 }
187
188 cnt = auio.uio_resid;
189 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
190 if (error)
191 if (auio.uio_resid != cnt && (error == ERESTART ||
192 error == EINTR || error == EWOULDBLOCK))
193 error = 0;
194 cnt -= auio.uio_resid;
195 ktrgenio(fd, UIO_READ, buf, cnt, error);
196 *retval = cnt;
197 out:
198 FILE_UNUSE(fp, l);
199 uvmspace_free(vm);
200 return (error);
201 }
202
203 /*
204 * Scatter read system call.
205 */
206 int
207 sys_readv(lwp_t *l, void *v, register_t *retval)
208 {
209 struct sys_readv_args /* {
210 syscallarg(int) fd;
211 syscallarg(const struct iovec *) iovp;
212 syscallarg(int) iovcnt;
213 } */ *uap = v;
214
215 return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
216 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
217 }
218
219 int
220 do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
221 off_t *offset, int flags, register_t *retval)
222 {
223 struct proc *p;
224 struct uio auio;
225 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
226 struct vmspace *vm;
227 int i, error;
228 size_t cnt;
229 u_int iovlen;
230 struct file *fp;
231 struct filedesc *fdp;
232 struct iovec *ktriov = NULL;
233
234 if (iovcnt == 0)
235 return EINVAL;
236
237 p = l->l_proc;
238 fdp = p->p_fd;
239
240 if ((fp = fd_getfile(fdp, fd)) == NULL)
241 return EBADF;
242
243 if ((fp->f_flag & FREAD) == 0) {
244 simple_unlock(&fp->f_slock);
245 return EBADF;
246 }
247
248 FILE_USE(fp);
249
250 error = proc_vmspace_getref(p, &vm);
251 if (error) {
252 FILE_UNUSE(fp, l);
253 return error;
254 }
255
256 if (offset == NULL)
257 offset = &fp->f_offset;
258 else {
259 struct vnode *vp = fp->f_data;
260 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
261 error = ESPIPE;
262 goto out;
263 }
264 /*
265 * Test that the device is seekable ?
266 * XXX This works because no file systems actually
267 * XXX take any action on the seek operation.
268 */
269 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
270 if (error != 0)
271 goto out;
272 }
273
274 iovlen = iovcnt * sizeof(struct iovec);
275 if (flags & FOF_IOV_SYSSPACE)
276 iov = __UNCONST(iovp);
277 else {
278 iov = aiov;
279 if ((u_int)iovcnt > UIO_SMALLIOV) {
280 if ((u_int)iovcnt > IOV_MAX) {
281 error = EINVAL;
282 goto out;
283 }
284 iov = kmem_alloc(iovlen, KM_SLEEP);
285 if (iov == NULL) {
286 error = ENOMEM;
287 goto out;
288 }
289 needfree = iov;
290 }
291 error = copyin(iovp, iov, iovlen);
292 if (error)
293 goto done;
294 }
295
296 auio.uio_iov = iov;
297 auio.uio_iovcnt = iovcnt;
298 auio.uio_rw = UIO_READ;
299 auio.uio_vmspace = vm;
300
301 auio.uio_resid = 0;
302 for (i = 0; i < iovcnt; i++, iov++) {
303 auio.uio_resid += iov->iov_len;
304 /*
305 * Reads return ssize_t because -1 is returned on error.
306 * Therefore we must restrict the length to SSIZE_MAX to
307 * avoid garbage return values.
308 */
309 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
310 error = EINVAL;
311 goto done;
312 }
313 }
314
315 /*
316 * if tracing, save a copy of iovec
317 */
318 if (ktrpoint(KTR_GENIO)) {
319 ktriov = kmem_alloc(iovlen, KM_SLEEP);
320 if (ktriov != NULL)
321 memcpy(ktriov, auio.uio_iov, iovlen);
322 }
323
324 cnt = auio.uio_resid;
325 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
326 if (error)
327 if (auio.uio_resid != cnt && (error == ERESTART ||
328 error == EINTR || error == EWOULDBLOCK))
329 error = 0;
330 cnt -= auio.uio_resid;
331 *retval = cnt;
332
333 if (ktriov != NULL) {
334 ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
335 kmem_free(ktriov, iovlen);
336 }
337
338 done:
339 if (needfree)
340 kmem_free(needfree, iovlen);
341 out:
342 FILE_UNUSE(fp, l);
343 uvmspace_free(vm);
344 return (error);
345 }
346
347 /*
348 * Write system call
349 */
350 int
351 sys_write(lwp_t *l, void *v, register_t *retval)
352 {
353 struct sys_write_args /* {
354 syscallarg(int) fd;
355 syscallarg(const void *) buf;
356 syscallarg(size_t) nbyte;
357 } */ *uap = v;
358 int fd;
359 struct file *fp;
360 proc_t *p;
361 struct filedesc *fdp;
362
363 fd = SCARG(uap, fd);
364 p = l->l_proc;
365 fdp = p->p_fd;
366
367 if ((fp = fd_getfile(fdp, fd)) == NULL)
368 return (EBADF);
369
370 if ((fp->f_flag & FWRITE) == 0) {
371 simple_unlock(&fp->f_slock);
372 return (EBADF);
373 }
374
375 FILE_USE(fp);
376
377 /* dofilewrite() will unuse the descriptor for us */
378 return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
379 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
380 }
381
/*
 * Common code for write(2)/pwrite(2): transfer up to nbyte bytes from
 * buf to fp through the file's fo_write operation, starting at
 * *offset (updated if FOF_UPDATE_OFFSET is in flags).  Consumes the
 * caller's use reference on fp on all paths.  Posts SIGPIPE on EPIPE,
 * matching traditional write() semantics.
 */
int
dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	proc_t *p;
	struct vmspace *vm;
	size_t cnt;
	int error;

	p = l->l_proc;
	/* Hold a reference on the address space for the uiomove. */
	error = proc_vmspace_getref(p, &vm);
	if (error) {
		FILE_UNUSE(fp, l);
		return error;
	}
	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = vm;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* Partial transfer before an interrupt counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Broken pipe: deliver SIGPIPE as well as the error. */
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(p, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	uvmspace_free(vm);
	return (error);
}
437
438 /*
439 * Gather write system call
440 */
441 int
442 sys_writev(lwp_t *l, void *v, register_t *retval)
443 {
444 struct sys_writev_args /* {
445 syscallarg(int) fd;
446 syscallarg(const struct iovec *) iovp;
447 syscallarg(int) iovcnt;
448 } */ *uap = v;
449
450 return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
451 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
452 }
453
454 int
455 do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
456 off_t *offset, int flags, register_t *retval)
457 {
458 struct proc *p;
459 struct uio auio;
460 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
461 struct vmspace *vm;
462 int i, error;
463 size_t cnt;
464 u_int iovlen;
465 struct file *fp;
466 struct filedesc *fdp;
467 struct iovec *ktriov = NULL;
468
469 if (iovcnt == 0)
470 return EINVAL;
471
472 p = l->l_proc;
473 fdp = p->p_fd;
474
475 if ((fp = fd_getfile(fdp, fd)) == NULL)
476 return EBADF;
477
478 if ((fp->f_flag & FWRITE) == 0) {
479 simple_unlock(&fp->f_slock);
480 return EBADF;
481 }
482
483 FILE_USE(fp);
484
485 error = proc_vmspace_getref(p, &vm);
486 if (error) {
487 FILE_UNUSE(fp, l);
488 return error;
489 }
490
491 if (offset == NULL)
492 offset = &fp->f_offset;
493 else {
494 struct vnode *vp = fp->f_data;
495 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
496 error = ESPIPE;
497 goto out;
498 }
499 /*
500 * Test that the device is seekable ?
501 * XXX This works because no file systems actually
502 * XXX take any action on the seek operation.
503 */
504 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
505 if (error != 0)
506 goto out;
507 }
508
509 iovlen = iovcnt * sizeof(struct iovec);
510 if (flags & FOF_IOV_SYSSPACE)
511 iov = __UNCONST(iovp);
512 else {
513 iov = aiov;
514 if ((u_int)iovcnt > UIO_SMALLIOV) {
515 if ((u_int)iovcnt > IOV_MAX) {
516 error = EINVAL;
517 goto out;
518 }
519 iov = kmem_alloc(iovlen, KM_SLEEP);
520 if (iov == NULL) {
521 error = ENOMEM;
522 goto out;
523 }
524 needfree = iov;
525 }
526 error = copyin(iovp, iov, iovlen);
527 if (error)
528 goto done;
529 }
530
531 auio.uio_iov = iov;
532 auio.uio_iovcnt = iovcnt;
533 auio.uio_rw = UIO_WRITE;
534 auio.uio_vmspace = vm;
535
536 auio.uio_resid = 0;
537 for (i = 0; i < iovcnt; i++, iov++) {
538 auio.uio_resid += iov->iov_len;
539 /*
540 * Writes return ssize_t because -1 is returned on error.
541 * Therefore we must restrict the length to SSIZE_MAX to
542 * avoid garbage return values.
543 */
544 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
545 error = EINVAL;
546 goto done;
547 }
548 }
549
550 /*
551 * if tracing, save a copy of iovec
552 */
553 if (ktrpoint(KTR_GENIO)) {
554 ktriov = kmem_alloc(iovlen, KM_SLEEP);
555 if (ktriov != NULL)
556 memcpy(ktriov, auio.uio_iov, iovlen);
557 }
558
559 cnt = auio.uio_resid;
560 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
561 if (error) {
562 if (auio.uio_resid != cnt && (error == ERESTART ||
563 error == EINTR || error == EWOULDBLOCK))
564 error = 0;
565 if (error == EPIPE) {
566 mutex_enter(&proclist_mutex);
567 psignal(p, SIGPIPE);
568 mutex_exit(&proclist_mutex);
569 }
570 }
571 cnt -= auio.uio_resid;
572 *retval = cnt;
573
574 if (ktriov != NULL) {
575 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
576 kmem_free(ktriov, iovlen);
577 }
578
579 done:
580 if (needfree)
581 kmem_free(needfree, iovlen);
582 out:
583 FILE_UNUSE(fp, l);
584 uvmspace_free(vm);
585 return (error);
586 }
587
/*
 * Ioctl system call.  The command word encodes the direction and size
 * of the argument data (IOC_IN/IOC_OUT/IOC_VOID, IOCPARM_LEN); we stage
 * the data in a stack buffer (or a kmem allocation for large requests)
 * around the fo_ioctl call.
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
#define	STK_PARAMS	128
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		/* com is printed in the error path below; keep it sane. */
		com = 0;
		goto out;
	}

	/* Close-on-exec flags live in the descriptor table, not the file. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		/* Too big for the stack buffer; allocate. */
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-size IOC_IN: pass the user pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync with the object's own notion. */
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	switch (error) {
	case -1:
		/* A driver leaked -1; diagnose it, then map to ENOTTY. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
721
722 /*
723 * Select system call.
724 */
725 int
726 sys_pselect(lwp_t *l, void *v, register_t *retval)
727 {
728 struct sys_pselect_args /* {
729 syscallarg(int) nd;
730 syscallarg(fd_set *) in;
731 syscallarg(fd_set *) ou;
732 syscallarg(fd_set *) ex;
733 syscallarg(const struct timespec *) ts;
734 syscallarg(sigset_t *) mask;
735 } */ * const uap = v;
736 struct timespec ats;
737 struct timeval atv, *tv = NULL;
738 sigset_t amask, *mask = NULL;
739 int error;
740
741 if (SCARG(uap, ts)) {
742 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
743 if (error)
744 return error;
745 atv.tv_sec = ats.tv_sec;
746 atv.tv_usec = ats.tv_nsec / 1000;
747 tv = &atv;
748 }
749 if (SCARG(uap, mask) != NULL) {
750 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
751 if (error)
752 return error;
753 mask = &amask;
754 }
755
756 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
757 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
758 }
759
/*
 * Validate a timeout and record the monotonic start time in *sleeptv.
 * Returns 0 on success, -1 if the timeout is malformed.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	int rv = 0;

	if (itimerfix(tv) != 0)
		rv = -1;
	else
		getmicrouptime(sleeptv);
	return rv;
}
768
769 int
770 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
771 {
772 /*
773 * We have to recalculate the timeout on every retry.
774 */
775 struct timeval slepttv;
776 /*
777 * reduce tv by elapsed time
778 * based on monotonic time scale
779 */
780 getmicrouptime(&slepttv);
781 timeradd(tv, sleeptv, tv);
782 timersub(tv, &slepttv, tv);
783 *sleeptv = slepttv;
784 return tvtohz(tv);
785 }
786
787 int
788 sys_select(lwp_t *l, void *v, register_t *retval)
789 {
790 struct sys_select_args /* {
791 syscallarg(int) nd;
792 syscallarg(fd_set *) in;
793 syscallarg(fd_set *) ou;
794 syscallarg(fd_set *) ex;
795 syscallarg(struct timeval *) tv;
796 } */ * const uap = v;
797 struct timeval atv, *tv = NULL;
798 int error;
799
800 if (SCARG(uap, tv)) {
801 error = copyin(SCARG(uap, tv), (void *)&atv,
802 sizeof(atv));
803 if (error)
804 return error;
805 tv = &atv;
806 }
807
808 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
809 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
810 }
811
/*
 * Common code for select(2)/pselect(2).  Copies in up to three fd_set
 * bitmaps, then loops: scan the descriptors, and if nothing is ready,
 * block on select_cv until a wakeup, timeout, or signal.  The
 * SEL_SCANNING/SEL_BLOCKING state and the nselcoll counter detect
 * events that arrive between dropping select_lock for the scan and
 * going to sleep, forcing a rescan instead of a missed wakeup.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	/* 6 slots: 3 input bitmaps followed by 3 output bitmaps. */
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	proc_t * const p = l->l_proc;
	char *bits;
	int ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; treat a NULL pointer as all-zero. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* pselect(): install the temporary signal mask for the wait. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Snapshot state before the unlocked scan. */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If a wakeup or a collision occurred while we were
		 * scanning, rescan rather than risk sleeping through it.
		 */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the caller's signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
912
/*
 * Scan the three input bitmaps for ready descriptors, polling each set
 * descriptor via its fo_poll operation, and build the corresponding
 * output bitmaps.  Returns the count of ready descriptors in *retval.
 */
int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	/* Poll events corresponding to the read/write/except sets. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			/* Walk the set bits in this word via ffs(). */
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				/* A set bit on a closed descriptor is an error. */
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FILE_USE(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
					obits |= (1 << j);
					n++;
				}
				FILE_UNUSE(fp, l);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}
949
950 /*
951 * Poll system call.
952 */
953 int
954 sys_poll(lwp_t *l, void *v, register_t *retval)
955 {
956 struct sys_poll_args /* {
957 syscallarg(struct pollfd *) fds;
958 syscallarg(u_int) nfds;
959 syscallarg(int) timeout;
960 } */ * const uap = v;
961 struct timeval atv, *tv = NULL;
962
963 if (SCARG(uap, timeout) != INFTIM) {
964 atv.tv_sec = SCARG(uap, timeout) / 1000;
965 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
966 tv = &atv;
967 }
968
969 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
970 tv, NULL);
971 }
972
973 /*
974 * Poll system call.
975 */
976 int
977 sys_pollts(lwp_t *l, void *v, register_t *retval)
978 {
979 struct sys_pollts_args /* {
980 syscallarg(struct pollfd *) fds;
981 syscallarg(u_int) nfds;
982 syscallarg(const struct timespec *) ts;
983 syscallarg(const sigset_t *) mask;
984 } */ * const uap = v;
985 struct timespec ats;
986 struct timeval atv, *tv = NULL;
987 sigset_t amask, *mask = NULL;
988 int error;
989
990 if (SCARG(uap, ts)) {
991 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
992 if (error)
993 return error;
994 atv.tv_sec = ats.tv_sec;
995 atv.tv_usec = ats.tv_nsec / 1000;
996 tv = &atv;
997 }
998 if (SCARG(uap, mask)) {
999 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
1000 if (error)
1001 return error;
1002 mask = &amask;
1003 }
1004
1005 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
1006 tv, mask);
1007 }
1008
/*
 * Common code for poll(2)/pollts(2).  Copies in the pollfd array, then
 * loops: scan the descriptors, and if nothing is ready, block on
 * select_cv until a wakeup, timeout, or signal.  Uses the same
 * SEL_SCANNING/SEL_BLOCKING + nselcoll protocol as selcommon() to
 * avoid missing a wakeup that arrives during the unlocked scan.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char smallbits[32 * sizeof(struct pollfd)];
	proc_t * const p = l->l_proc;
	void *	bits;
	sigset_t oldmask;
	int ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* pollts(): install the temporary signal mask for the wait. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Snapshot state before the unlocked scan. */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If a wakeup or a collision occurred while we were
		 * scanning, rescan rather than risk sleeping through it.
		 */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the caller's signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1092
/*
 * Poll each descriptor in the pollfd array via its fo_poll operation,
 * filling in the revents fields.  Invalid descriptors are flagged with
 * POLLNVAL (and counted as ready, per POSIX); negative fds are skipped.
 * Returns the count of pollfds with non-zero revents in *retval.
 */
int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int i, n;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			/* Out of range: cannot possibly be open. */
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd: ignore this slot entirely. */
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				/* POLLERR/POLLHUP are always reportable. */
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}
1126
1127 /*ARGSUSED*/
1128 int
1129 seltrue(dev_t dev, int events, lwp_t *l)
1130 {
1131
1132 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1133 }
1134
/*
 * Record a select request: note that `selector' is waiting on the
 * object owning `sip', so selwakeup() can find it later.  Typically
 * called from an object's fo_poll routine.  Only the first waiter is
 * recorded by identity; any additional waiter just sets the collision
 * flag, which makes selwakeup() broadcast instead.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}
1153
/*
 * Do a wakeup when a selectable event occurs.  With multiple waiters
 * (collision) we broadcast to everyone blocked in select/poll; with a
 * single recorded waiter we wake just that LWP and reset its selflag
 * so it rescans rather than blocks.
 */
void
selwakeup(struct selinfo *sip)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				lwp_unsleep(l);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);
}
1194
/*
 * Notify everything interested in `sip': wake any select()/poll()
 * waiters, then deliver the hint to attached kevent knotes.
 */
void
selnotify(struct selinfo *sip, long knhint)
{

	selwakeup(sip);
	KNOTE(&sip->sel_klist, knhint);
}
1202
/*
 * Remove an LWP from all objects that it is waiting for.  Called with
 * select_lock held; walks the LWP's own l_selwait list, so only
 * selinfos this LWP was the recorded waiter on are cleared.
 */
static void
selclear(void)
{
	struct selinfo *sip;
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(&select_lock));

	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
		/* Every entry on our list must point back at us. */
		KASSERT(sip->sel_lwp == l);
		sip->sel_lwp = NULL;
	}
}
1219
/*
 * Initialize the select/poll system calls: set up the global lock and
 * condition variable shared by all selecting/polling LWPs.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
	cv_init(&select_cv, "select");
}
1230
/*
 * Initialize a selector.  All-zero state means: no recorded waiter,
 * no collision pending, empty knote list.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}
1240
/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/* Cheap unlocked pre-check; re-tested below under select_lock. */
	if (sip->sel_lwp == NULL)
		return;

	mutex_enter(&select_lock);
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}
1262