/*	$NetBSD: sys_generic.c,v 1.115 2008/03/21 21:55:00 ad Exp $	*/
2
3 /*-
4 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1989, 1993
41 * The Regents of the University of California. All rights reserved.
42 * (c) UNIX System Laboratories, Inc.
43 * All or some portions of this file are derived from material licensed
44 * to the University of California by American Telephone and Telegraph
45 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
46 * the permission of UNIX System Laboratories, Inc.
47 *
48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions
50 * are met:
51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution.
56 * 3. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
73 */
74
75 /*
76 * System calls relating to files.
77 */
78
79 #include <sys/cdefs.h>
80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.115 2008/03/21 21:55:00 ad Exp $");
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/ioctl.h>
86 #include <sys/file.h>
87 #include <sys/proc.h>
88 #include <sys/socketvar.h>
89 #include <sys/signalvar.h>
90 #include <sys/uio.h>
91 #include <sys/kernel.h>
92 #include <sys/stat.h>
93 #include <sys/kmem.h>
94 #include <sys/poll.h>
95 #include <sys/vnode.h>
96 #include <sys/mount.h>
97 #include <sys/syscallargs.h>
98 #include <sys/ktrace.h>
99
100 #include <uvm/uvm_extern.h>
101
102 /* Flags for lwp::l_selflag. */
103 #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
104 #define SEL_SCANNING 1 /* polling descriptors */
105 #define SEL_BLOCKING 2 /* about to block on select_cv */
106
107 static int selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
108 static int pollscan(lwp_t *, struct pollfd *, int, register_t *);
109
110 /* Global state for select()/poll(). */
111 kmutex_t select_lock;
112 kcondvar_t select_cv;
113 int nselcoll;
114
115 /*
116 * Read system call.
117 */
118 /* ARGSUSED */
119 int
120 sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
121 {
122 /* {
123 syscallarg(int) fd;
124 syscallarg(void *) buf;
125 syscallarg(size_t) nbyte;
126 } */
127 file_t *fp;
128 int fd;
129
130 fd = SCARG(uap, fd);
131
132 if ((fp = fd_getfile(fd)) == NULL)
133 return (EBADF);
134
135 if ((fp->f_flag & FREAD) == 0) {
136 fd_putfile(fd);
137 return (EBADF);
138 }
139
140 /* dofileread() will unuse the descriptor for us */
141 return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
142 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
143 }
144
145 int
146 dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
147 off_t *offset, int flags, register_t *retval)
148 {
149 struct iovec aiov;
150 struct uio auio;
151 size_t cnt;
152 int error;
153 lwp_t *l;
154
155 l = curlwp;
156
157 aiov.iov_base = (void *)buf;
158 aiov.iov_len = nbyte;
159 auio.uio_iov = &aiov;
160 auio.uio_iovcnt = 1;
161 auio.uio_resid = nbyte;
162 auio.uio_rw = UIO_READ;
163 auio.uio_vmspace = l->l_proc->p_vmspace;
164
165 /*
166 * Reads return ssize_t because -1 is returned on error. Therefore
167 * we must restrict the length to SSIZE_MAX to avoid garbage return
168 * values.
169 */
170 if (auio.uio_resid > SSIZE_MAX) {
171 error = EINVAL;
172 goto out;
173 }
174
175 cnt = auio.uio_resid;
176 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
177 if (error)
178 if (auio.uio_resid != cnt && (error == ERESTART ||
179 error == EINTR || error == EWOULDBLOCK))
180 error = 0;
181 cnt -= auio.uio_resid;
182 ktrgenio(fd, UIO_READ, buf, cnt, error);
183 *retval = cnt;
184 out:
185 fd_putfile(fd);
186 return (error);
187 }
188
189 /*
190 * Scatter read system call.
191 */
192 int
193 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
194 {
195 /* {
196 syscallarg(int) fd;
197 syscallarg(const struct iovec *) iovp;
198 syscallarg(int) iovcnt;
199 } */
200
201 return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
202 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
203 }
204
/*
 * Common code for readv(2) and kernel callers: scatter data from a
 * file into an array of iovecs.
 *
 * fd:      open descriptor; its reference is released here on all paths.
 * iovp:    the iovec array.  In user space unless FOF_IOV_SYSSPACE is
 *          set in flags, in which case it is used in place.
 * iovcnt:  number of iovecs; must be non-zero and at most IOV_MAX.
 * offset:  NULL to read at (and implicitly via flags update) the file's
 *          current offset; otherwise an explicit position, allowed only
 *          on seekable vnodes.
 * retval:  receives the number of bytes transferred.
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
	off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;

	if (iovcnt == 0)
		return EINVAL;

	if ((fp = fd_getfile(fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FREAD) == 0) {
		fd_putfile(fd);
		return EBADF;
	}

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/*
		 * An explicit offset was supplied (e.g. preadv): only
		 * seekable vnodes qualify; pipes and FIFOs get ESPIPE.
		 */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		/*
		 * Small iovec arrays live on the stack; larger ones
		 * (up to IOV_MAX) are heap-allocated and freed at
		 * "done".
		 */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = curproc->p_vmspace;

	/* Total the transfer, rejecting any overflow past SSIZE_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/* A partial transfer interrupted by a signal succeeds. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	fd_putfile(fd);
	return (error);
}
317
318 /*
319 * Write system call
320 */
321 int
322 sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
323 {
324 /* {
325 syscallarg(int) fd;
326 syscallarg(const void *) buf;
327 syscallarg(size_t) nbyte;
328 } */
329 file_t *fp;
330 int fd;
331
332 fd = SCARG(uap, fd);
333
334 if ((fp = fd_getfile(fd)) == NULL)
335 return (EBADF);
336
337 if ((fp->f_flag & FWRITE) == 0) {
338 fd_putfile(fd);
339 return (EBADF);
340 }
341
342 /* dofilewrite() will unuse the descriptor for us */
343 return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
344 &fp->f_offset, FOF_UPDATE_OFFSET, retval));
345 }
346
/*
 * Common code for write(2): transfer up to nbyte bytes from the user
 * buffer buf to the file, starting at *offset.
 *
 * The caller must hold a reference on the descriptor; it is released
 * here on all paths.  On success the byte count is stored in *retval.
 */
int
dofilewrite(int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;

	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = curproc->p_vmspace;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A partial transfer interrupted by a signal succeeds. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Writing on a broken pipe also delivers SIGPIPE. */
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(curproc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	fd_putfile(fd);
	return (error);
}
393
394 /*
395 * Gather write system call
396 */
397 int
398 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
399 {
400 /* {
401 syscallarg(int) fd;
402 syscallarg(const struct iovec *) iovp;
403 syscallarg(int) iovcnt;
404 } */
405
406 return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
407 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
408 }
409
/*
 * Common code for writev(2) and kernel callers: gather data from an
 * array of iovecs and write it to a file.
 *
 * fd:      open descriptor; its reference is released here on all paths.
 * iovp:    the iovec array.  In user space unless FOF_IOV_SYSSPACE is
 *          set in flags, in which case it is used in place.
 * iovcnt:  number of iovecs; must be non-zero and at most IOV_MAX.
 * offset:  NULL to write at the file's current offset; otherwise an
 *          explicit position, allowed only on seekable vnodes.
 * retval:  receives the number of bytes transferred.
 */
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
	off_t *offset, int flags, register_t *retval)
{
	struct uio auio;
	struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int i, error;
	size_t cnt;
	u_int iovlen;
	struct file *fp;
	struct iovec *ktriov = NULL;

	if (iovcnt == 0)
		return EINVAL;

	if ((fp = fd_getfile(fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FWRITE) == 0) {
		fd_putfile(fd);
		return EBADF;
	}

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/*
		 * An explicit offset was supplied (e.g. pwritev): only
		 * seekable vnodes qualify; pipes and FIFOs get ESPIPE.
		 */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		/*
		 * Small iovec arrays live on the stack; larger ones
		 * (up to IOV_MAX) are heap-allocated and freed at
		 * "done".
		 */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = curproc->p_vmspace;

	/* Total the transfer, rejecting any overflow past SSIZE_MAX. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO)) {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A partial transfer interrupted by a signal succeeds. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Writing on a broken pipe also delivers SIGPIPE. */
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(curproc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	fd_putfile(fd);
	return (error);
}
528
/*
 * Ioctl system call.
 *
 * FIONCLEX/FIOCLEX are handled directly on the descriptor table.
 * For all other commands, the size encoded in the command word is
 * used to stage an argument buffer (on the stack when it fits, in
 * kmem otherwise), copy the user data in and/or out as the IOC_IN /
 * IOC_OUT bits direct, and dispatch to the file's fo_ioctl routine.
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */
	struct file *fp;
	proc_t *p;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	void *data, *memp;
#define	STK_PARAMS	128
	u_long stkbuf[STK_PARAMS/sizeof(u_long)];
	fdfile_t *ff;

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(SCARG(uap, fd))) == NULL)
		return (EBADF);

	/* The descriptor must be open for either reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;
		goto out;
	}

	/* Close-on-exec is a descriptor property, not a file op. */
	ff = fdp->fd_ofiles[SCARG(uap, fd)];
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		ff->ff_exclose = 0;
		goto out;

	case FIOCLEX:
		ff->ff_exclose = 1;
		fdp->fd_exclose = 1;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		/* Argument too big for the stack buffer; use kmem. */
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Toggle non-blocking I/O on the file, then tell it. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data);
		break;

	case FIOASYNC:
		/* Toggle async (SIGIO) notification, then tell the file. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	fd_putfile(SCARG(uap, fd));
	switch (error) {
	case -1:
		/*
		 * A driver returned the historical "unrecognized"
		 * value; log it and fall through to ENOTTY.
		 */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
667
668 /*
669 * Select system call.
670 */
671 int
672 sys_pselect(struct lwp *l, const struct sys_pselect_args *uap, register_t *retval)
673 {
674 /* {
675 syscallarg(int) nd;
676 syscallarg(fd_set *) in;
677 syscallarg(fd_set *) ou;
678 syscallarg(fd_set *) ex;
679 syscallarg(const struct timespec *) ts;
680 syscallarg(sigset_t *) mask;
681 } */
682 struct timespec ats;
683 struct timeval atv, *tv = NULL;
684 sigset_t amask, *mask = NULL;
685 int error;
686
687 if (SCARG(uap, ts)) {
688 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
689 if (error)
690 return error;
691 atv.tv_sec = ats.tv_sec;
692 atv.tv_usec = ats.tv_nsec / 1000;
693 tv = &atv;
694 }
695 if (SCARG(uap, mask) != NULL) {
696 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
697 if (error)
698 return error;
699 mask = &amask;
700 }
701
702 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
703 SCARG(uap, ou), SCARG(uap, ex), tv, mask);
704 }
705
/*
 * Validate a timeout and record the monotonic start time for later
 * gettimeleft() calls.  Returns -1 if the timeout is malformed.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
714
715 int
716 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
717 {
718 /*
719 * We have to recalculate the timeout on every retry.
720 */
721 struct timeval slepttv;
722 /*
723 * reduce tv by elapsed time
724 * based on monotonic time scale
725 */
726 getmicrouptime(&slepttv);
727 timeradd(tv, sleeptv, tv);
728 timersub(tv, &slepttv, tv);
729 *sleeptv = slepttv;
730 return tvtohz(tv);
731 }
732
733 int
734 sys_select(struct lwp *l, const struct sys_select_args *uap, register_t *retval)
735 {
736 /* {
737 syscallarg(int) nd;
738 syscallarg(fd_set *) in;
739 syscallarg(fd_set *) ou;
740 syscallarg(fd_set *) ex;
741 syscallarg(struct timeval *) tv;
742 } */
743 struct timeval atv, *tv = NULL;
744 int error;
745
746 if (SCARG(uap, tv)) {
747 error = copyin(SCARG(uap, tv), (void *)&atv,
748 sizeof(atv));
749 if (error)
750 return error;
751 tv = &atv;
752 }
753
754 return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
755 SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
756 }
757
/*
 * Common code for select(2)/pselect(2).
 *
 * The "bits" buffer holds six regions of ni bytes each: the input
 * read/write/except sets at offsets 0/1/2 and the corresponding
 * output sets at offsets 3/4/5.
 *
 * The scan loop runs unlocked; l_selflag and nselcoll (both protected
 * by select_lock) detect wakeups and collisions that happen during
 * the scan, so we never sleep after a missed event.
 *
 * tv, if non-NULL, is the remaining timeout and is updated in place.
 * mask, if non-NULL, is a temporary signal mask (pselect).
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	proc_t * const p = l->l_proc;
	char *bits;
	int ncoll, error, timo;
	size_t ni;
	sigset_t oldmask;
	struct timeval sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each supplied set; treat a NULL pointer as empty. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Install the temporary mask; the old one is restored below. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Mark ourselves scanning, then poll without the lock. */
		l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If something fired selnotify() while we were
		 * scanning, rescan instead of sleeping past it.
		 */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
858
859 int
860 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
861 register_t *retval)
862 {
863 static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
864 POLLWRNORM | POLLHUP | POLLERR,
865 POLLRDBAND };
866 int msk, i, j, fd, n;
867 fd_mask ibits, obits;
868 file_t *fp;
869
870 n = 0;
871 for (msk = 0; msk < 3; msk++) {
872 for (i = 0; i < nfd; i += NFDBITS) {
873 ibits = *ibitp++;
874 obits = 0;
875 while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
876 ibits &= ~(1 << j);
877 if ((fp = fd_getfile(fd)) == NULL)
878 return (EBADF);
879 if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
880 obits |= (1 << j);
881 n++;
882 }
883 fd_putfile(fd);
884 }
885 *obitp++ = obits;
886 }
887 }
888 *retval = n;
889 return (0);
890 }
891
892 /*
893 * Poll system call.
894 */
895 int
896 sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
897 {
898 /* {
899 syscallarg(struct pollfd *) fds;
900 syscallarg(u_int) nfds;
901 syscallarg(int) timeout;
902 } */
903 struct timeval atv, *tv = NULL;
904
905 if (SCARG(uap, timeout) != INFTIM) {
906 atv.tv_sec = SCARG(uap, timeout) / 1000;
907 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
908 tv = &atv;
909 }
910
911 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
912 tv, NULL);
913 }
914
915 /*
916 * Poll system call.
917 */
918 int
919 sys_pollts(struct lwp *l, const struct sys_pollts_args *uap, register_t *retval)
920 {
921 /* {
922 syscallarg(struct pollfd *) fds;
923 syscallarg(u_int) nfds;
924 syscallarg(const struct timespec *) ts;
925 syscallarg(const sigset_t *) mask;
926 } */
927 struct timespec ats;
928 struct timeval atv, *tv = NULL;
929 sigset_t amask, *mask = NULL;
930 int error;
931
932 if (SCARG(uap, ts)) {
933 error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
934 if (error)
935 return error;
936 atv.tv_sec = ats.tv_sec;
937 atv.tv_usec = ats.tv_nsec / 1000;
938 tv = &atv;
939 }
940 if (SCARG(uap, mask)) {
941 error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
942 if (error)
943 return error;
944 mask = &amask;
945 }
946
947 return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
948 tv, mask);
949 }
950
/*
 * Common code for poll(2)/pollts(2).
 *
 * Copies the pollfd array in, then repeatedly scans it until some
 * descriptor has events, the timeout expires, or a signal arrives.
 * The same scan/sleep protocol as selcommon() applies: l_selflag and
 * nselcoll (under select_lock) detect wakeups that race with the
 * unlocked scan.  The whole array (including revents) is copied back
 * out on completion.
 *
 * tv, if non-NULL, is the remaining timeout and is updated in place.
 * mask, if non-NULL, is a temporary signal mask (pollts).
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char smallbits[32 * sizeof(struct pollfd)];
	proc_t * const p = l->l_proc;
	void *	bits;
	sigset_t oldmask;
	int ncoll, error, timo;
	size_t ni;
	struct timeval sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Install the temporary mask; the old one is restored below. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Mark ourselves scanning, then poll without the lock. */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/*
		 * If something fired selnotify() while we were
		 * scanning, rescan instead of sleeping past it.
		 */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
1034
1035 int
1036 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
1037 {
1038 int i, n;
1039 file_t *fp;
1040
1041 n = 0;
1042 for (i = 0; i < nfd; i++, fds++) {
1043 if (fds->fd < 0) {
1044 fds->revents = 0;
1045 } else if ((fp = fd_getfile(fds->fd)) == NULL) {
1046 fds->revents = POLLNVAL;
1047 n++;
1048 } else {
1049 fds->revents = (*fp->f_ops->fo_poll)(fp,
1050 fds->events | POLLERR | POLLHUP);
1051 if (fds->revents != 0)
1052 n++;
1053 fd_putfile(fds->fd);
1054 }
1055 }
1056 *retval = n;
1057 return (0);
1058 }
1059
1060 /*ARGSUSED*/
1061 int
1062 seltrue(dev_t dev, int events, lwp_t *l)
1063 {
1064
1065 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1066 }
1067
/*
 * Record a select request.
 *
 * Called with a not-ready descriptor's selinfo so that a later
 * selnotify() on it can find and wake the polling LWP.  Only the
 * first LWP is remembered by name; any further distinct waiters just
 * set sel_collision, which makes selnotify() broadcast instead.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{

	mutex_enter(&select_lock);
	if (sip->sel_lwp == NULL) {
		/* First named waiter, although there may be more. */
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
	} else if (sip->sel_lwp != selector) {
		/* Multiple waiters. */
		sip->sel_collision = true;
	}
	mutex_exit(&select_lock);
}
1086
/*
 * Do a wakeup when a selectable event occurs.
 *
 * events is currently unused by the select path; knhint is passed
 * through to KNOTE() for kqueue listeners.  May be called with the
 * single recorded waiter in any l_selflag state; the state machine
 * below ensures each waiter is only unslept once per notification.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	lwp_t *l;

	mutex_enter(&select_lock);
	if (sip->sel_collision) {
		/* Multiple waiters - just notify everybody. */
		nselcoll++;
		sip->sel_collision = false;
		cv_broadcast(&select_cv);
	} else if (sip->sel_lwp != NULL) {
		/* Only one LWP waiting. */
		l = sip->sel_lwp;
		if (l->l_selflag == SEL_BLOCKING) {
			/*
			 * If it's sleeping, wake it up.  If not, it's
			 * already awake but hasn't yet removed itself
			 * from the selector.  We reset the state below
			 * so that we only attempt to do this once.
			 */
			lwp_lock(l);
			if (l->l_wchan == &select_cv) {
				/* lwp_unsleep() releases the LWP lock. */
				(void)lwp_unsleep(l, true);
			} else
				lwp_unlock(l);
		} else {
			/*
			 * Not yet asleep.  Reset its state below so that
			 * it will go around again.
			 */
		}
		/* SEL_RESET forces the waiter to rescan before sleeping. */
		l->l_selflag = SEL_RESET;
	}
	mutex_exit(&select_lock);

	/* Also notify any kqueue listeners attached to this selinfo. */
	KNOTE(&sip->sel_klist, knhint);
}
1129
1130 /*
1131 * Remove an LWP from all objects that it is waiting for.
1132 */
1133 void
1134 selclear(void)
1135 {
1136 struct selinfo *sip;
1137 lwp_t *l = curlwp;
1138
1139 KASSERT(mutex_owned(&select_lock));
1140
1141 SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
1142 KASSERT(sip->sel_lwp == l);
1143 sip->sel_lwp = NULL;
1144 }
1145 }
1146
/*
 * Initialize the select/poll system calls.
 *
 * NOTE(review): the mutex is initialized at IPL_VM, presumably so
 * selnotify() may be called from interrupt context — confirm against
 * the callers before changing.
 */
void
selsysinit(void)
{

	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&select_cv, "select");
}
1157
/*
 * Initialize a selector.
 *
 * Zeroing clears sel_lwp, sel_collision, and the klist, which is all
 * the other routines here require of a fresh selinfo.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}
1167
/*
 * Destroy a selector.  The owning object must not gain new
 * references while this is in progress: all activity on the
 * selector must be stopped.
 */
void
seldestroy(struct selinfo *sip)
{
	lwp_t *l;

	/* Unlocked fast path: nobody ever registered with us. */
	if (sip->sel_lwp == NULL)
		return;

	/* Re-check under the lock; a waiter may have just cleared. */
	mutex_enter(&select_lock);
	if ((l = sip->sel_lwp) != NULL) {
		/* This should rarely happen, so SLIST_REMOVE() is OK. */
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_exit(&select_lock);
}
1189