sys_generic.c revision 1.114 1 /* $NetBSD: sys_generic.c,v 1.114 2008/03/17 18:01:44 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1989, 1993
41 * The Regents of the University of California. All rights reserved.
42 * (c) UNIX System Laboratories, Inc.
43 * All or some portions of this file are derived from material licensed
44 * to the University of California by American Telephone and Telegraph
45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46 * the permission of UNIX System Laboratories, Inc.
47 *
48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions
50 * are met:
51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution.
56 * 3. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
73 */
74
75 /*
76 * System calls relating to files.
77 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.114 2008/03/17 18:01:44 ad Exp $");
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99
100 #include <uvm/uvm_extern.h>
101
/* Flags for lwp::l_selflag. */
#define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING 1 /* polling descriptors */
#define SEL_BLOCKING 2 /* about to block on select_cv */

/* Scan helpers shared by selcommon()/pollcommon(). */
static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int pollscan(lwp_t *, struct pollfd *, int, register_t *);

/* Global state for select()/poll(). */
kmutex_t select_lock;	/* guards l_selflag, l_selwait, selinfo state */
kcondvar_t select_cv;	/* all waiters block here (single CV, shared) */
int nselcoll;		/* collision generation; bumped in selnotify() */
114
/*
 * Read system call.
 *
 * Looks up the descriptor, checks it is open for reading, and hands
 * the transfer off to dofileread() with FOF_UPDATE_OFFSET so the
 * file offset advances.
 */
/* ARGSUSED */
int
sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */
	int fd;
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;

	fd = SCARG(uap, fd);
	p = l->l_proc;
	fdp = p->p_fd;

	/* NOTE(review): fd_getfile() appears to return fp locked —
	 * the failure path below drops that lock with FILE_UNLOCK(). */
	if ((fp = fd_getfile(fdp, fd)) == NULL)
		return (EBADF);

	/* Descriptor must be open for reading. */
	if ((fp->f_flag & FREAD) == 0) {
		FILE_UNLOCK(fp);
		return (EBADF);
	}

	FILE_USE(fp);

	/* dofileread() will unuse the descriptor for us */
	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
150
151 int
152 dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
153 off_t *offset, int flags, register_t *retval)
154 {
155 struct iovec aiov;
156 struct uio auio;
157 size_t cnt;
158 int error;
159 lwp_t *l;
160
161 l = curlwp;
162
163 aiov.iov_base = (void *)buf;
164 aiov.iov_len = nbyte;
165 auio.uio_iov = &aiov;
166 auio.uio_iovcnt = 1;
167 auio.uio_resid = nbyte;
168 auio.uio_rw = UIO_READ;
169 auio.uio_vmspace = l->l_proc->p_vmspace;
170
171 /*
172 * Reads return ssize_t because -1 is returned on error. Therefore
173 * we must restrict the length to SSIZE_MAX to avoid garbage return
174 * values.
175 */
176 if (auio.uio_resid > SSIZE_MAX) {
177 error = EINVAL;
178 goto out;
179 }
180
181 cnt = auio.uio_resid;
182 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
183 if (error)
184 if (auio.uio_resid != cnt && (error == ERESTART ||
185 error == EINTR || error == EWOULDBLOCK))
186 error = 0;
187 cnt -= auio.uio_resid;
188 ktrgenio(fd, UIO_READ, buf, cnt, error);
189 *retval = cnt;
190 out:
191 FILE_UNUSE(fp, l);
192 return (error);
193 }
194
195 /*
196 * Scatter read system call.
197 */
198 int
199 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
200 {
201 /* {
202 syscallarg(int) fd;
203 syscallarg(const struct iovec *) iovp;
204 syscallarg(int) iovcnt;
205 } */
206
207 return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
208 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
209 }
210
/*
 * Common code for readv(2)/preadv(2): gather the iovec array, build a
 * uio and perform a single fo_read on the descriptor.
 *
 * offset == NULL means use (and implicitly update) fp->f_offset;
 * otherwise the supplied offset is used, which is only permitted on
 * seekable vnodes.  With FOF_IOV_SYSSPACE the iovec array is already
 * in kernel space and is used in place; otherwise it is copied in.
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
	off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;
	lwp_t *l;

	if (iovcnt == 0)
		return EINVAL;

	l = curlwp;

	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	/* Must be open for reading; drop the lock fd_getfile() took. */
	if ((fp->f_flag & FREAD) == 0) {
		FILE_UNLOCK(fp);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/* Explicit offsets only make sense on seekable vnodes. */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	/*
	 * Small arrays live on the stack; larger ones (bounded by
	 * IOV_MAX) are allocated temporarily and freed at 'done'.
	 */
	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	/* A partial transfer before an interruption counts as success. */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
328
329 /*
330 * Write system call
331 */
332 int
333 sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
334 {
335 /* {
336 syscallarg(int) fd;
337 syscallarg(const void *) buf;
338 syscallarg(size_t) nbyte;
339 } */
340 int fd;
341 struct file *fp;
342
343 fd = SCARG(uap, fd);
344
345 if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
346 return (EBADF);
347
348 if ((fp->f_flag & FWRITE) == 0) {
349 FILE_UNLOCK(fp);
350 return (EBADF);
351 }
352
353 FILE_USE(fp);
354
355 /* dofilewrite() will unuse the descriptor for us */
356 return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
357 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
358 }
359
/*
 * Common code for write(2)-style transfers: write up to 'nbyte' bytes
 * from 'buf' to 'fp' starting at *offset, storing the byte count in
 * *retval.  The caller must already have marked 'fp' in use with
 * FILE_USE(); it is released with FILE_UNUSE() on every return path.
 *
 * On EPIPE the writing process is also sent SIGPIPE, per POSIX.
 */
int
dofilewrite(int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A partial transfer before an interruption is a success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
409
410 /*
411 * Gather write system call
412 */
413 int
414 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
415 {
416 /* {
417 syscallarg(int) fd;
418 syscallarg(const struct iovec *) iovp;
419 syscallarg(int) iovcnt;
420 } */
421
422 return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
423 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
424 }
425
426 int
427 do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
428 off_t *offset, int flags, register_t *retval)
429 {
430 struct uio auio;
431 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
432 int i, error;
433 size_t cnt;
434 u_int iovlen;
435 struct file *fp;
436 struct iovec *ktriov = NULL;
437 lwp_t *l;
438
439 l = curlwp;
440
441 if (iovcnt == 0)
442 return EINVAL;
443
444 if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
445 return EBADF;
446
447 if ((fp->f_flag & FWRITE) == 0) {
448 FILE_UNLOCK(fp);
449 return EBADF;
450 }
451
452 FILE_USE(fp);
453
454 if (offset == NULL)
455 offset = &fp->f_offset;
456 else {
457 struct vnode *vp = fp->f_data;
458 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
459 error = ESPIPE;
460 goto out;
461 }
462 /*
463 * Test that the device is seekable ?
464 * XXX This works because no file systems actually
465 * XXX take any action on the seek operation.
466 */
467 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
468 if (error != 0)
469 goto out;
470 }
471
472 iovlen = iovcnt * sizeof(struct iovec);
473 if (flags & FOF_IOV_SYSSPACE)
474 iov = __UNCONST(iovp);
475 else {
476 iov = aiov;
477 if ((u_int)iovcnt > UIO_SMALLIOV) {
478 if ((u_int)iovcnt > IOV_MAX) {
479 error = EINVAL;
480 goto out;
481 }
482 iov = kmem_alloc(iovlen, KM_SLEEP);
483 if (iov == NULL) {
484 error = ENOMEM;
485 goto out;
486 }
487 needfree = iov;
488 }
489 error = copyin(iovp, iov, iovlen);
490 if (error)
491 goto done;
492 }
493
494 auio.uio_iov = iov;
495 auio.uio_iovcnt = iovcnt;
496 auio.uio_rw = UIO_WRITE;
497 auio.uio_vmspace = curproc->p_vmspace;
498
499 auio.uio_resid = 0;
500 for (i = 0; i < iovcnt; i++, iov++) {
501 auio.uio_resid += iov->iov_len;
502 /*
503 * Writes return ssize_t because -1 is returned on error.
504 * Therefore we must restrict the length to SSIZE_MAX to
505 * avoid garbage return values.
506 */
507 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
508 error = EINVAL;
509 goto done;
510 }
511 }
512
513 /*
514 * if tracing, save a copy of iovec
515 */
516 if (ktrpoint(KTR_GENIO)) {
517 ktriov = kmem_alloc(iovlen, KM_SLEEP);
518 if (ktriov != NULL)
519 memcpy(ktriov, auio.uio_iov, iovlen);
520 }
521
522 cnt = auio.uio_resid;
523 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
524 if (error) {
525 if (auio.uio_resid != cnt && (error == ERESTART ||
526 error == EINTR || error == EWOULDBLOCK))
527 error = 0;
528 if (error == EPIPE) {
529 mutex_enter(&proclist_mutex);
530 psignal(l->l_proc, SIGPIPE);
531 mutex_exit(&proclist_mutex);
532 }
533 }
534 cnt -= auio.uio_resid;
535 *retval = cnt;
536
537 if (ktriov != NULL) {
538 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
539 kmem_free(ktriov, iovlen);
540 }
541
542 done:
543 if (needfree)
544 kmem_free(needfree, iovlen);
545 out:
546 FILE_UNUSE(fp, l);
547 return (error);
548 }
549
/*
 * Ioctl system call.
 *
 * Decodes the command word to decide how much data to copy in/out of
 * user space, dispatches to the file's fo_ioctl, and translates the
 * special EPASSTHROUGH / -1 results into ENOTTY.
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
#define STK_PARAMS 128
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	/* ioctl requires the descriptor to be open for read or write. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;	/* keep the error-reporting switch sane */
		goto out;
	}

	/*
	 * FIONCLEX/FIOCLEX manipulate the close-on-exec flag in the
	 * descriptor table directly; no fo_ioctl call is needed.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	/* Small argument buffers live on the stack, large ones in kmem. */
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-size IOC_IN: pass the user pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before informing the object. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	/*
	 * -1 is a buggy driver's "not mine" answer: log it, then treat
	 * it (and EPASSTHROUGH) as ENOTTY.
	 */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
691
692 /*
693 * Select system call.
694 */
695 int
696 sys_pselect(struct lwp *l, const struct sys_pselect_args *uap, register_t *retval)
697 {
698 /* {
699 syscallarg(int) nd;
700 syscallarg(fd_set *) in;
701 syscallarg(fd_set *) ou;
702 syscallarg(fd_set *) ex;
703 syscallarg(const struct timespec *) ts;
704 syscallarg(sigset_t *) mask;
705 } */
706 struct timespec ats;
707 struct timeval atv, *tv = NULL;
708 sigset_t amask, *mask = NULL;
709 int error;
710
711 if (SCARG(uap, ts)) {
712 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
713 if (error)
714 return error;
715 atv.tv_sec = ats.tv_sec;
716 atv.tv_usec = ats.tv_nsec / 1000;
717 tv = &atv;
718 }
719 if (SCARG(uap, mask) != NULL) {
720 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
721 if (error)
722 return error;
723 mask = &amask;
724 }
725
726 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
727 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
728 }
729
/*
 * Validate a select/poll timeout and record the monotonic start time
 * in *sleeptv for later use by gettimeleft().  Returns -1 if the
 * timeout is invalid, otherwise 0.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
738
739 int
740 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
741 {
742 /*
743 * We have to recalculate the timeout on every retry.
744 */
745 struct timeval slepttv;
746 /*
747 * reduce tv by elapsed time
748 * based on monotonic time scale
749 */
750 getmicrouptime(&slepttv);
751 timeradd(tv, sleeptv, tv);
752 timersub(tv, &slepttv, tv);
753 *sleeptv = slepttv;
754 return tvtohz(tv);
755 }
756
757 int
758 sys_select(struct lwp *l, const struct sys_select_args *uap, register_t *retval)
759 {
760 /* {
761 syscallarg(int) nd;
762 syscallarg(fd_set *) in;
763 syscallarg(fd_set *) ou;
764 syscallarg(fd_set *) ex;
765 syscallarg(struct timeval *) tv;
766 } */
767 struct timeval atv, *tv = NULL;
768 int error;
769
770 if (SCARG(uap, tv)) {
771 error = copyin(SCARG(uap, tv), (void *)&atv,
772 sizeof(atv));
773 if (error)
774 return error;
775 tv = &atv;
776 }
777
778 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
779 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
780 }
781
/*
 * Common code for select(2)/pselect(2).
 *
 * The three input fd_sets are copied into one buffer laid out as six
 * consecutive regions of 'ni' bytes: in/ou/ex inputs at slots 0-2 and
 * the corresponding results at slots 3-5.  The scan/sleep loop runs
 * with an optional temporary signal mask installed, and retries until
 * a descriptor is ready, the timeout expires, or a signal arrives.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	proc_t * const p = l->l_proc;
	char *bits;
	int ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	/* ni = bytes per fd_set region; allocate 6 regions (3 in, 3 out). */
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; a NULL user pointer means "empty". */
#define getbits(name, x) \
	if (u_ ## name) { \
		error = copyin(u_ ## name, bits + ni * x, ni); \
		if (error) \
			goto done; \
	} else \
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller's signal mask (pselect). */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with l_selflag = SEL_SCANNING and the lock
		 * dropped; selnotify() resets the flag (and bumps
		 * nselcoll) if an event fires meanwhile, which forces
		 * a rescan instead of a sleep.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the three result regions (slots 3-5) back out. */
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
882
883 int
884 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
885 register_t *retval)
886 {
887 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
888 POLLWRNORM | POLLHUP | POLLERR,
889 POLLRDBAND };
890 proc_t *p = l->l_proc;
891 struct filedesc *fdp;
892 int msk, i, j, fd, n;
893 fd_mask ibits, obits;
894 struct file *fp;
895
896 fdp = p->p_fd;
897 n = 0;
898 for (msk = 0; msk < 3; msk++) {
899 for (i = 0; i < nfd; i += NFDBITS) {
900 ibits = *ibitp++;
901 obits = 0;
902 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
903 ibits &= ~(1 << j);
904 if ((fp = fd_getfile(fdp, fd)) == NULL)
905 return (EBADF);
906 FILE_USE(fp);
907 if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
908 obits |= (1 << j);
909 n++;
910 }
911 FILE_UNUSE(fp, l);
912 }
913 *obitp++ = obits;
914 }
915 }
916 *retval = n;
917 return (0);
918 }
919
920 /*
921 * Poll system call.
922 */
923 int
924 sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
925 {
926 /* {
927 syscallarg(struct pollfd *) fds;
928 syscallarg(u_int) nfds;
929 syscallarg(int) timeout;
930 } */
931 struct timeval atv, *tv = NULL;
932
933 if (SCARG(uap, timeout) != INFTIM) {
934 atv.tv_sec = SCARG(uap, timeout) / 1000;
935 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
936 tv = &atv;
937 }
938
939 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
940 tv, NULL);
941 }
942
943 /*
944 * Poll system call.
945 */
946 int
947 sys_pollts(struct lwp *l, const struct sys_pollts_args *uap, register_t *retval)
948 {
949 /* {
950 syscallarg(struct pollfd *) fds;
951 syscallarg(u_int) nfds;
952 syscallarg(const struct timespec *) ts;
953 syscallarg(const sigset_t *) mask;
954 } */
955 struct timespec ats;
956 struct timeval atv, *tv = NULL;
957 sigset_t amask, *mask = NULL;
958 int error;
959
960 if (SCARG(uap, ts)) {
961 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
962 if (error)
963 return error;
964 atv.tv_sec = ats.tv_sec;
965 atv.tv_usec = ats.tv_nsec / 1000;
966 tv = &atv;
967 }
968 if (SCARG(uap, mask)) {
969 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
970 if (error)
971 return error;
972 mask = &amask;
973 }
974
975 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
976 tv, mask);
977 }
978
/*
 * Common code for poll(2)/pollts(2).
 *
 * Copies in the pollfd array, then runs the same scan/sleep retry loop
 * as selcommon() — scan with the lock dropped, re-check l_selflag and
 * the collision counter under select_lock, and block on select_cv only
 * if nothing fired meanwhile.  The array (with revents filled in) is
 * copied back out on success.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char smallbits[32 * sizeof(struct pollfd)];
	proc_t * const p = l->l_proc;
	void * bits;
	sigset_t oldmask;
	int ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	/* Small arrays use the stack buffer, larger ones are allocated. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller's signal mask (pollts). */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * selnotify() resets l_selflag and bumps nselcoll if an
		 * event fires during the unlocked scan; either change
		 * forces a rescan instead of a sleep.
		 */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1062
/*
 * Poll each entry of a pollfd array once, filling in revents.
 * *retval receives the number of entries with non-zero revents.
 *
 * Per poll(2) semantics: an out-of-range or unopen descriptor yields
 * POLLNVAL (and counts as ready); a negative fd is ignored.
 */
int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	proc_t *p = l->l_proc;
	struct filedesc *fdp;
	int i, n;
	struct file *fp;

	fdp = p->p_fd;
	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				FILE_USE(fp);
				/* POLLERR/POLLHUP are always reportable. */
				fds->revents = (*fp->f_ops->fo_poll)(fp,
				    fds->events | POLLERR | POLLHUP, l);
				if (fds->revents != 0)
					n++;
				FILE_UNUSE(fp, l);
			}
		}
	}
	*retval = n;
	return (0);
}
1096
1097 /*ARGSUSED*/
1098 int
1099 seltrue(dev_t dev, int events, lwp_t *l)
1100 {
1101
1102 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1103 }
1104
/*
 * Record a select request.
 *
 * A selinfo remembers at most one waiting LWP.  If a second, distinct
 * LWP records itself, the selinfo is marked as a collision and
 * selnotify() will wake everybody instead of just the named waiter.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}
1123
/*
 * Do a wakeup when a selectable event occurs.
 *
 * With a recorded collision, every waiter is broadcast awake and the
 * generation counter is bumped so in-progress scans retry.  With a
 * single named waiter, only that LWP is disturbed: woken if it is
 * blocked on select_cv, or merely flagged (via SEL_RESET) so it
 * rescans instead of sleeping.  Finally any kevent watchers are
 * notified.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				(void)lwp_unsleep(l, true);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);

	/* Notify any registered kqueue/kevent watchers. */
	KNOTE(&sip->sel_klist, knhint);
}
1166
1167 /*
1168 * Remove an LWP from all objects that it is waiting for.
1169 */
1170 void
1171 selclear(void)
1172 {
1173 struct selinfo *sip;
1174 lwp_t *l = curlwp;
1175
1176 KASSERT(mutex_owned(&select_lock));
1177
1178 SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1179 KASSERT(sip->sel_lwp == l);
1180 sip->sel_lwp = NULL;
1181 }
1182 }
1183
/*
 * Initialize the select/poll system calls.
 *
 * NOTE(review): select_lock is initialized at IPL_VM — presumably
 * because selnotify() can be called from interrupt context; confirm
 * against the callers before changing.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&select_cv, "select");
}
1194
1195 /*
1196 * Initialize a selector.
1197 */
1198 void
1199 selinit(struct selinfo *sip)
1200 {
1201
1202 memset(sip, 0, sizeof(*sip));
1203 }
1204
/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/*
	 * Unlocked fast path: with all activity stopped, a NULL
	 * sel_lwp cannot become non-NULL, so we can skip the lock.
	 */
	if (sip->sel_lwp == NULL)
		return;

	mutex_enter(&select_lock);
	/* Re-check under the lock before unhooking from the waiter. */
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}
1226