      1 /*	$NetBSD: sys_generic.c,v 1.104 2007/08/15 12:07:34 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.104 2007/08/15 12:07:34 ad Exp $");
     81 
     82 #include <sys/param.h>
     83 #include <sys/systm.h>
     84 #include <sys/filedesc.h>
     85 #include <sys/ioctl.h>
     86 #include <sys/file.h>
     87 #include <sys/proc.h>
     88 #include <sys/socketvar.h>
     89 #include <sys/signalvar.h>
     90 #include <sys/uio.h>
     91 #include <sys/kernel.h>
     92 #include <sys/stat.h>
     93 #include <sys/kmem.h>
     94 #include <sys/poll.h>
     95 #include <sys/vnode.h>
     96 #include <sys/mount.h>
     97 #include <sys/syscallargs.h>
     98 #include <sys/ktrace.h>
     99 
    100 #include <uvm/uvm_extern.h>
    101 
    102 /* Flags for lwp::l_selflag. */
    103 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
    104 #define	SEL_SCANNING	1	/* polling descriptors */
    105 #define	SEL_BLOCKING	2	/* about to block on select_cv */
    106 
    107 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
    108 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
    109 static void	selclear(void);
    110 
    111 /* Global state for select()/poll(). */
    112 kmutex_t	select_lock;
    113 kcondvar_t	select_cv;
    114 int		nselcoll;
    115 
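/*
 * In outline, the wakeup protocol used by the select()/poll() code below
 * (see selcommon(), pollcommon(), selrecord() and selwakeup()):
 *
 * A scanning LWP sets l_selflag to SEL_SCANNING, drops select_lock and
 * polls each descriptor; fo_poll handlers with nothing ready call
 * selrecord(), linking their selinfo onto the LWP's l_selwait list.
 * If nothing was found ready, the flag is still SEL_SCANNING and
 * nselcoll is unchanged, the LWP sets SEL_BLOCKING and sleeps on
 * select_cv.  selwakeup() wakes the single recorded LWP or, when more
 * than one LWP was interested (a collision), bumps nselcoll and
 * broadcasts on select_cv so that everybody rescans.
 */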
    116 /*
    117  * Read system call.
    118  */
    119 /* ARGSUSED */
    120 int
    121 sys_read(lwp_t *l, void *v, register_t *retval)
    122 {
    123 	struct sys_read_args /* {
    124 		syscallarg(int)		fd;
    125 		syscallarg(void *)	buf;
    126 		syscallarg(size_t)	nbyte;
    127 	} */ *uap = v;
    128 	int		fd;
    129 	struct file	*fp;
    130 	proc_t		*p;
    131 	struct filedesc	*fdp;
    132 
    133 	fd = SCARG(uap, fd);
    134 	p = l->l_proc;
    135 	fdp = p->p_fd;
    136 
    137 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    138 		return (EBADF);
    139 
    140 	if ((fp->f_flag & FREAD) == 0) {
    141 		simple_unlock(&fp->f_slock);
    142 		return (EBADF);
    143 	}
    144 
    145 	FILE_USE(fp);
    146 
    147 	/* dofileread() will unuse the descriptor for us */
    148 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    149 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    150 }
    151 
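/*
 * Common code for read(2): perform a single-iovec read on an already
 * referenced (FILE_USE'd) file, releasing that reference before return.
 * The length is limited to SSIZE_MAX and the flags (e.g.
 * FOF_UPDATE_OFFSET) are passed through to the file's fo_read routine.
 */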
    152 int
    153 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
    154 	off_t *offset, int flags, register_t *retval)
    155 {
    156 	struct iovec aiov;
    157 	struct uio auio;
    158 	proc_t *p;
    159 	struct vmspace *vm;
    160 	size_t cnt;
    161 	int error;
    162 	p = l->l_proc;
    163 
    164 	error = proc_vmspace_getref(p, &vm);
    165 	if (error) {
    166 		goto out;
    167 	}
    168 
    169 	aiov.iov_base = (void *)buf;
    170 	aiov.iov_len = nbyte;
    171 	auio.uio_iov = &aiov;
    172 	auio.uio_iovcnt = 1;
    173 	auio.uio_resid = nbyte;
    174 	auio.uio_rw = UIO_READ;
    175 	auio.uio_vmspace = vm;
    176 
    177 	/*
    178 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    179 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    180 	 * values.
    181 	 */
    182 	if (auio.uio_resid > SSIZE_MAX) {
    183 		error = EINVAL;
    184 		goto out;
    185 	}
    186 
    187 	cnt = auio.uio_resid;
    188 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
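	/*
	 * An interrupted or restartable transfer that already moved some
	 * data is reported as a short read rather than an error.
	 */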
    189 	if (error)
    190 		if (auio.uio_resid != cnt && (error == ERESTART ||
    191 		    error == EINTR || error == EWOULDBLOCK))
    192 			error = 0;
    193 	cnt -= auio.uio_resid;
    194 	ktrgenio(fd, UIO_READ, buf, nbyte, error);
    195 	*retval = cnt;
    196  out:
    197 	FILE_UNUSE(fp, l);
    198 	uvmspace_free(vm);
    199 	return (error);
    200 }
    201 
    202 /*
    203  * Scatter read system call.
    204  */
    205 int
    206 sys_readv(lwp_t *l, void *v, register_t *retval)
    207 {
    208 	struct sys_readv_args /* {
    209 		syscallarg(int)				fd;
    210 		syscallarg(const struct iovec *)	iovp;
    211 		syscallarg(int)				iovcnt;
    212 	} */ *uap = v;
    213 
    214 	return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
    215 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    216 }
    217 
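/*
 * Common code for readv(2) and related calls: read into a caller-supplied
 * iovec array.  When an explicit offset is supplied the target must be a
 * seekable vnode; with FOF_IOV_SYSSPACE the iovec array is taken to be in
 * kernel space and is not copied in.
 */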
    218 int
    219 do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
    220     off_t *offset, int flags, register_t *retval)
    221 {
    222 	struct proc	*p;
    223 	struct uio	auio;
    224 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    225 	struct vmspace	*vm;
    226 	int		i, error;
    227 	size_t		cnt;
    228 	u_int		iovlen;
    229 	struct file	*fp;
    230 	struct filedesc	*fdp;
    231 	struct iovec	*ktriov = NULL;
    232 
    233 	if (iovcnt == 0)
    234 		return EINVAL;
    235 
    236 	p = l->l_proc;
    237 	fdp = p->p_fd;
    238 
    239 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    240 		return EBADF;
    241 
    242 	if ((fp->f_flag & FREAD) == 0) {
    243 		simple_unlock(&fp->f_slock);
    244 		return EBADF;
    245 	}
    246 
    247 	FILE_USE(fp);
    248 
    249 	if (offset == NULL)
    250 		offset = &fp->f_offset;
    251 	else {
    252 		struct vnode *vp = fp->f_data;
    253 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    254 			error = ESPIPE;
    255 			goto out;
    256 		}
    257 		/*
     258 		 * Test that the device is seekable?
    259 		 * XXX This works because no file systems actually
    260 		 * XXX take any action on the seek operation.
    261 		 */
    262 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    263 		if (error != 0)
    264 			goto out;
    265 	}
    266 
    267 	error = proc_vmspace_getref(p, &vm);
    268 	if (error)
    269 		goto out;
    270 
    271 	iovlen = iovcnt * sizeof(struct iovec);
    272 	if (flags & FOF_IOV_SYSSPACE)
    273 		iov = __UNCONST(iovp);
    274 	else {
    275 		iov = aiov;
    276 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    277 			if ((u_int)iovcnt > IOV_MAX) {
    278 				error = EINVAL;
    279 				goto out;
    280 			}
    281 			iov = kmem_alloc(iovlen, KM_SLEEP);
    282 			if (iov == NULL) {
    283 				error = ENOMEM;
    284 				goto out;
    285 			}
    286 			needfree = iov;
    287 		}
    288 		error = copyin(iovp, iov, iovlen);
    289 		if (error)
    290 			goto done;
    291 	}
    292 
    293 	auio.uio_iov = iov;
    294 	auio.uio_iovcnt = iovcnt;
    295 	auio.uio_rw = UIO_READ;
    296 	auio.uio_vmspace = vm;
    297 
    298 	auio.uio_resid = 0;
    299 	for (i = 0; i < iovcnt; i++, iov++) {
    300 		auio.uio_resid += iov->iov_len;
    301 		/*
    302 		 * Reads return ssize_t because -1 is returned on error.
    303 		 * Therefore we must restrict the length to SSIZE_MAX to
    304 		 * avoid garbage return values.
    305 		 */
    306 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    307 			error = EINVAL;
    308 			goto done;
    309 		}
    310 	}
    311 
    312 	/*
    313 	 * if tracing, save a copy of iovec
    314 	 */
    315 	if (ktrpoint(KTR_GENIO))  {
    316 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    317 		if (ktriov != NULL)
    318 			memcpy(ktriov, auio.uio_iov, iovlen);
    319 	}
    320 
    321 	cnt = auio.uio_resid;
    322 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    323 	if (error)
    324 		if (auio.uio_resid != cnt && (error == ERESTART ||
    325 		    error == EINTR || error == EWOULDBLOCK))
    326 			error = 0;
    327 	cnt -= auio.uio_resid;
    328 	*retval = cnt;
    329 
    330 	if (ktriov != NULL) {
    331 		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
    332 		kmem_free(ktriov, iovlen);
    333 	}
    334 
    335  done:
    336 	if (needfree)
    337 		kmem_free(needfree, iovlen);
    338  out:
    339 	FILE_UNUSE(fp, l);
    340 	uvmspace_free(vm);
    341 	return (error);
    342 }
    343 
    344 /*
    345  * Write system call
    346  */
    347 int
    348 sys_write(lwp_t *l, void *v, register_t *retval)
    349 {
    350 	struct sys_write_args /* {
    351 		syscallarg(int)			fd;
    352 		syscallarg(const void *)	buf;
    353 		syscallarg(size_t)		nbyte;
    354 	} */ *uap = v;
    355 	int		fd;
    356 	struct file	*fp;
    357 	proc_t		*p;
    358 	struct filedesc	*fdp;
    359 
    360 	fd = SCARG(uap, fd);
    361 	p = l->l_proc;
    362 	fdp = p->p_fd;
    363 
    364 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    365 		return (EBADF);
    366 
    367 	if ((fp->f_flag & FWRITE) == 0) {
    368 		simple_unlock(&fp->f_slock);
    369 		return (EBADF);
    370 	}
    371 
    372 	FILE_USE(fp);
    373 
    374 	/* dofilewrite() will unuse the descriptor for us */
    375 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    376 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    377 }
    378 
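/*
 * Common code for write(2): the write-side counterpart of dofileread()
 * above.  Also posts SIGPIPE to the process if the write fails with EPIPE.
 */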
    379 int
    380 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
    381 	size_t nbyte, off_t *offset, int flags, register_t *retval)
    382 {
    383 	struct iovec aiov;
    384 	struct uio auio;
    385 	proc_t *p;
    386 	struct vmspace *vm;
    387 	size_t cnt;
    388 	int error;
    389 
    390 	p = l->l_proc;
    391 	error = proc_vmspace_getref(p, &vm);
    392 	if (error) {
    393 		goto out;
    394 	}
    395 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
    396 	aiov.iov_len = nbyte;
    397 	auio.uio_iov = &aiov;
    398 	auio.uio_iovcnt = 1;
    399 	auio.uio_resid = nbyte;
    400 	auio.uio_rw = UIO_WRITE;
    401 	auio.uio_vmspace = vm;
    402 
    403 	/*
    404 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    405 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    406 	 * values.
    407 	 */
    408 	if (auio.uio_resid > SSIZE_MAX) {
    409 		error = EINVAL;
    410 		goto out;
    411 	}
    412 
    413 	cnt = auio.uio_resid;
    414 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
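	/*
	 * As with reads, a transfer interrupted after moving some data is
	 * reported as a short write; a write failing with EPIPE additionally
	 * raises SIGPIPE in the writing process.
	 */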
    415 	if (error) {
    416 		if (auio.uio_resid != cnt && (error == ERESTART ||
    417 		    error == EINTR || error == EWOULDBLOCK))
    418 			error = 0;
    419 		if (error == EPIPE) {
    420 			mutex_enter(&proclist_mutex);
    421 			psignal(p, SIGPIPE);
    422 			mutex_exit(&proclist_mutex);
    423 		}
    424 	}
    425 	cnt -= auio.uio_resid;
    426 	ktrgenio(fd, UIO_WRITE, buf, nbyte, error);
    427 	*retval = cnt;
    428  out:
    429 	FILE_UNUSE(fp, l);
    430 	uvmspace_free(vm);
    431 	return (error);
    432 }
    433 
    434 /*
    435  * Gather write system call
    436  */
    437 int
    438 sys_writev(lwp_t *l, void *v, register_t *retval)
    439 {
    440 	struct sys_writev_args /* {
    441 		syscallarg(int)				fd;
    442 		syscallarg(const struct iovec *)	iovp;
    443 		syscallarg(int)				iovcnt;
    444 	} */ *uap = v;
    445 
    446 	return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
    447 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    448 }
    449 
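/*
 * Common code for writev(2) and related calls: the write-side counterpart
 * of do_filereadv() above.
 */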
    450 int
    451 do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
    452     off_t *offset, int flags, register_t *retval)
    453 {
    454 	struct proc	*p;
    455 	struct uio	auio;
    456 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    457 	struct vmspace	*vm;
    458 	int		i, error;
    459 	size_t		cnt;
    460 	u_int		iovlen;
    461 	struct file	*fp;
    462 	struct filedesc	*fdp;
    463 	struct iovec	*ktriov = NULL;
    464 
    465 	if (iovcnt == 0)
    466 		return EINVAL;
    467 
    468 	p = l->l_proc;
    469 	fdp = p->p_fd;
    470 
    471 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    472 		return EBADF;
    473 
    474 	if ((fp->f_flag & FWRITE) == 0) {
    475 		simple_unlock(&fp->f_slock);
    476 		return EBADF;
    477 	}
    478 
    479 	FILE_USE(fp);
    480 
    481 	if (offset == NULL)
    482 		offset = &fp->f_offset;
    483 	else {
    484 		struct vnode *vp = fp->f_data;
    485 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    486 			error = ESPIPE;
    487 			goto out;
    488 		}
    489 		/*
     490 		 * Test that the device is seekable?
    491 		 * XXX This works because no file systems actually
    492 		 * XXX take any action on the seek operation.
    493 		 */
    494 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    495 		if (error != 0)
    496 			goto out;
    497 	}
    498 
    499 	error = proc_vmspace_getref(p, &vm);
    500 	if (error)
    501 		goto out;
    502 
    503 	iovlen = iovcnt * sizeof(struct iovec);
    504 	if (flags & FOF_IOV_SYSSPACE)
    505 		iov = __UNCONST(iovp);
    506 	else {
    507 		iov = aiov;
    508 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    509 			if ((u_int)iovcnt > IOV_MAX) {
    510 				error = EINVAL;
    511 				goto out;
    512 			}
    513 			iov = kmem_alloc(iovlen, KM_SLEEP);
    514 			if (iov == NULL) {
    515 				error = ENOMEM;
    516 				goto out;
    517 			}
    518 			needfree = iov;
    519 		}
    520 		error = copyin(iovp, iov, iovlen);
    521 		if (error)
    522 			goto done;
    523 	}
    524 
    525 	auio.uio_iov = iov;
    526 	auio.uio_iovcnt = iovcnt;
    527 	auio.uio_rw = UIO_WRITE;
    528 	auio.uio_vmspace = vm;
    529 
    530 	auio.uio_resid = 0;
    531 	for (i = 0; i < iovcnt; i++, iov++) {
    532 		auio.uio_resid += iov->iov_len;
    533 		/*
    534 		 * Writes return ssize_t because -1 is returned on error.
    535 		 * Therefore we must restrict the length to SSIZE_MAX to
    536 		 * avoid garbage return values.
    537 		 */
    538 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    539 			error = EINVAL;
    540 			goto done;
    541 		}
    542 	}
    543 
    544 	/*
    545 	 * if tracing, save a copy of iovec
    546 	 */
    547 	if (ktrpoint(KTR_GENIO))  {
    548 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    549 		if (ktriov != NULL)
    550 			memcpy(ktriov, auio.uio_iov, iovlen);
    551 	}
    552 
    553 	cnt = auio.uio_resid;
    554 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    555 	if (error) {
    556 		if (auio.uio_resid != cnt && (error == ERESTART ||
    557 		    error == EINTR || error == EWOULDBLOCK))
    558 			error = 0;
    559 		if (error == EPIPE) {
    560 			mutex_enter(&proclist_mutex);
    561 			psignal(p, SIGPIPE);
    562 			mutex_exit(&proclist_mutex);
    563 		}
    564 	}
    565 	cnt -= auio.uio_resid;
    566 	*retval = cnt;
    567 
    568 	if (ktriov != NULL) {
    569 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
    570 		kmem_free(ktriov, iovlen);
    571 	}
    572 
    573  done:
    574 	if (needfree)
    575 		kmem_free(needfree, iovlen);
    576  out:
    577 	FILE_UNUSE(fp, l);
    578 	uvmspace_free(vm);
    579 	return (error);
    580 }
    581 
    582 /*
    583  * Ioctl system call
    584  */
    585 /* ARGSUSED */
    586 int
    587 sys_ioctl(struct lwp *l, void *v, register_t *retval)
    588 {
    589 	struct sys_ioctl_args /* {
    590 		syscallarg(int)		fd;
    591 		syscallarg(u_long)	com;
    592 		syscallarg(void *)	data;
    593 	} */ *uap = v;
    594 	struct file	*fp;
    595 	proc_t		*p;
    596 	struct filedesc	*fdp;
    597 	u_long		com;
    598 	int		error;
    599 	u_int		size;
    600 	void 		*data, *memp;
    601 #define	STK_PARAMS	128
    602 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
    603 
    604 	error = 0;
    605 	p = l->l_proc;
    606 	fdp = p->p_fd;
    607 
    608 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
    609 		return (EBADF);
    610 
    611 	FILE_USE(fp);
    612 
    613 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
    614 		error = EBADF;
    615 		com = 0;
    616 		goto out;
    617 	}
    618 
    619 	switch (com = SCARG(uap, com)) {
    620 	case FIONCLEX:
    621 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
    622 		goto out;
    623 
    624 	case FIOCLEX:
    625 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
    626 		goto out;
    627 	}
    628 
    629 	/*
    630 	 * Interpret high order word to find amount of data to be
    631 	 * copied to/from the user's address space.
    632 	 */
    633 	size = IOCPARM_LEN(com);
    634 	if (size > IOCPARM_MAX) {
    635 		error = ENOTTY;
    636 		goto out;
    637 	}
    638 	memp = NULL;
    639 	if (size > sizeof(stkbuf)) {
    640 		memp = kmem_alloc(size, KM_SLEEP);
    641 		data = memp;
    642 	} else
    643 		data = (void *)stkbuf;
    644 	if (com&IOC_IN) {
    645 		if (size) {
    646 			error = copyin(SCARG(uap, data), data, size);
    647 			if (error) {
    648 				if (memp)
    649 					kmem_free(memp, size);
    650 				goto out;
    651 			}
    652 			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
    653 			    size, 0);
    654 		} else
    655 			*(void **)data = SCARG(uap, data);
    656 	} else if ((com&IOC_OUT) && size)
    657 		/*
    658 		 * Zero the buffer so the user always
    659 		 * gets back something deterministic.
    660 		 */
    661 		memset(data, 0, size);
    662 	else if (com&IOC_VOID)
    663 		*(void **)data = SCARG(uap, data);
    664 
    665 	switch (com) {
    666 
    667 	case FIONBIO:
    668 		if (*(int *)data != 0)
    669 			fp->f_flag |= FNONBLOCK;
    670 		else
    671 			fp->f_flag &= ~FNONBLOCK;
    672 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
    673 		break;
    674 
    675 	case FIOASYNC:
    676 		if (*(int *)data != 0)
    677 			fp->f_flag |= FASYNC;
    678 		else
    679 			fp->f_flag &= ~FASYNC;
    680 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
    681 		break;
    682 
    683 	default:
    684 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
    685 		/*
    686 		 * Copy any data to user, size was
    687 		 * already set and checked above.
    688 		 */
    689 		if (error == 0 && (com&IOC_OUT) && size) {
    690 			error = copyout(data, SCARG(uap, data), size);
    691 			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
    692 			    size, error);
    693 		}
    694 		break;
    695 	}
    696 	if (memp)
    697 		kmem_free(memp, size);
    698  out:
    699 	FILE_UNUSE(fp, l);
    700 	switch (error) {
    701 	case -1:
    702 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
    703 		    "pid=%d comm=%s\n",
    704 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
    705 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
    706 		    p->p_pid, p->p_comm);
    707 		/* FALLTHROUGH */
    708 	case EPASSTHROUGH:
    709 		error = ENOTTY;
    710 		/* FALLTHROUGH */
    711 	default:
    712 		return (error);
    713 	}
    714 }
    715 
    716 /*
     717  * Select system call (pselect: takes a timespec timeout and a signal mask).
    718  */
    719 int
    720 sys_pselect(lwp_t *l, void *v, register_t *retval)
    721 {
    722 	struct sys_pselect_args /* {
    723 		syscallarg(int)				nd;
    724 		syscallarg(fd_set *)			in;
    725 		syscallarg(fd_set *)			ou;
    726 		syscallarg(fd_set *)			ex;
    727 		syscallarg(const struct timespec *)	ts;
    728 		syscallarg(sigset_t *)			mask;
    729 	} */ * const uap = v;
    730 	struct timespec	ats;
    731 	struct timeval	atv, *tv = NULL;
    732 	sigset_t	amask, *mask = NULL;
    733 	int		error;
    734 
    735 	if (SCARG(uap, ts)) {
    736 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    737 		if (error)
    738 			return error;
    739 		atv.tv_sec = ats.tv_sec;
    740 		atv.tv_usec = ats.tv_nsec / 1000;
    741 		tv = &atv;
    742 	}
    743 	if (SCARG(uap, mask) != NULL) {
    744 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    745 		if (error)
    746 			return error;
    747 		mask = &amask;
    748 	}
    749 
    750 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    751 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    752 }
    753 
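/*
 * Timeout bookkeeping for select()/poll(): inittimeleft() validates the
 * timeout and records the current monotonic time; gettimeleft() subtracts
 * the time already slept and converts the remainder to ticks for
 * cv_timedwait_sig().
 */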
    754 int
    755 inittimeleft(struct timeval *tv, struct timeval *sleeptv)
    756 {
    757 	if (itimerfix(tv))
    758 		return -1;
    759 	getmicrouptime(sleeptv);
    760 	return 0;
    761 }
    762 
    763 int
    764 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
    765 {
    766 	/*
    767 	 * We have to recalculate the timeout on every retry.
    768 	 */
    769 	struct timeval slepttv;
    770 	/*
    771 	 * reduce tv by elapsed time
    772 	 * based on monotonic time scale
    773 	 */
    774 	getmicrouptime(&slepttv);
    775 	timeradd(tv, sleeptv, tv);
    776 	timersub(tv, &slepttv, tv);
    777 	*sleeptv = slepttv;
    778 	return tvtohz(tv);
    779 }
    780 
    781 int
    782 sys_select(lwp_t *l, void *v, register_t *retval)
    783 {
    784 	struct sys_select_args /* {
    785 		syscallarg(int)			nd;
    786 		syscallarg(fd_set *)		in;
    787 		syscallarg(fd_set *)		ou;
    788 		syscallarg(fd_set *)		ex;
    789 		syscallarg(struct timeval *)	tv;
    790 	} */ * const uap = v;
    791 	struct timeval atv, *tv = NULL;
    792 	int error;
    793 
    794 	if (SCARG(uap, tv)) {
    795 		error = copyin(SCARG(uap, tv), (void *)&atv,
    796 			sizeof(atv));
    797 		if (error)
    798 			return error;
    799 		tv = &atv;
    800 	}
    801 
    802 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    803 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    804 }
    805 
    806 int
    807 selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    808 	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
    809 {
    810 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
    811 			    sizeof(fd_mask) * 6];
    812 	proc_t		* const p = l->l_proc;
    813 	char 		*bits;
    814 	int		ncoll, error, timo;
    815 	size_t		ni;
    816 	sigset_t	oldmask;
    817 	struct timeval  sleeptv;
    818 
    819 	error = 0;
    820 	if (nd < 0)
    821 		return (EINVAL);
    822 	if (nd > p->p_fd->fd_nfiles) {
     823 		/* forgiving; slightly wrong: clamp instead of failing */
    824 		nd = p->p_fd->fd_nfiles;
    825 	}
    826 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
    827 	if (ni * 6 > sizeof(smallbits))
    828 		bits = kmem_alloc(ni * 6, KM_SLEEP);
    829 	else
    830 		bits = smallbits;
    831 
    832 #define	getbits(name, x)						\
    833 	if (u_ ## name) {						\
    834 		error = copyin(u_ ## name, bits + ni * x, ni);		\
    835 		if (error)						\
    836 			goto done;					\
    837 	} else								\
    838 		memset(bits + ni * x, 0, ni);
    839 	getbits(in, 0);
    840 	getbits(ou, 1);
    841 	getbits(ex, 2);
    842 #undef	getbits
    843 
    844 	timo = 0;
    845 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
    846 		error = EINVAL;
    847 		goto done;
    848 	}
    849 
    850 	if (mask) {
    851 		sigminusset(&sigcantmask, mask);
    852 		mutex_enter(&p->p_smutex);
    853 		oldmask = l->l_sigmask;
    854 		l->l_sigmask = *mask;
    855 		mutex_exit(&p->p_smutex);
    856 	} else
    857 		oldmask = l->l_sigmask;	/* XXXgcc */
    858 
    859 	mutex_enter(&select_lock);
    860 	SLIST_INIT(&l->l_selwait);
    861 	for (;;) {
    862 	 	l->l_selflag = SEL_SCANNING;
    863 		ncoll = nselcoll;
    864  		mutex_exit(&select_lock);
    865 
    866 		error = selscan(l, (fd_mask *)(bits + ni * 0),
    867 		    (fd_mask *)(bits + ni * 3), nd, retval);
    868 
    869 		mutex_enter(&select_lock);
    870 		if (error || *retval)
    871 			break;
    872 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
    873 			break;
    874 		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
    875 			continue;
    876 		l->l_selflag = SEL_BLOCKING;
    877 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
    878 		if (error != 0)
    879 			break;
    880 	}
    881 	selclear();
    882 	mutex_exit(&select_lock);
    883 
    884 	if (mask) {
    885 		mutex_enter(&p->p_smutex);
    886 		l->l_sigmask = oldmask;
    887 		mutex_exit(&p->p_smutex);
    888 	}
    889 
    890  done:
    891 	/* select is not restarted after signals... */
    892 	if (error == ERESTART)
    893 		error = EINTR;
    894 	if (error == EWOULDBLOCK)
    895 		error = 0;
    896 	if (error == 0 && u_in != NULL)
    897 		error = copyout(bits + ni * 3, u_in, ni);
    898 	if (error == 0 && u_ou != NULL)
    899 		error = copyout(bits + ni * 4, u_ou, ni);
    900 	if (error == 0 && u_ex != NULL)
    901 		error = copyout(bits + ni * 5, u_ex, ni);
    902 	if (bits != smallbits)
    903 		kmem_free(bits, ni * 6);
    904 	return (error);
    905 }
    906 
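/*
 * Scan the three descriptor sets (read, write, except), polling each
 * referenced file with the corresponding POLL* flags.  Ready descriptors
 * have their bit set in the output sets; the number found is returned
 * via retval.
 */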
    907 int
    908 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    909 	register_t *retval)
    910 {
    911 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    912 			       POLLWRNORM | POLLHUP | POLLERR,
    913 			       POLLRDBAND };
    914 	proc_t *p = l->l_proc;
    915 	struct filedesc	*fdp;
    916 	int msk, i, j, fd, n;
    917 	fd_mask ibits, obits;
    918 	struct file *fp;
    919 
    920 	fdp = p->p_fd;
    921 	n = 0;
    922 	for (msk = 0; msk < 3; msk++) {
    923 		for (i = 0; i < nfd; i += NFDBITS) {
    924 			ibits = *ibitp++;
    925 			obits = 0;
    926 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    927 				ibits &= ~(1 << j);
    928 				if ((fp = fd_getfile(fdp, fd)) == NULL)
    929 					return (EBADF);
    930 				FILE_USE(fp);
    931 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
    932 					obits |= (1 << j);
    933 					n++;
    934 				}
    935 				FILE_UNUSE(fp, l);
    936 			}
    937 			*obitp++ = obits;
    938 		}
    939 	}
    940 	*retval = n;
    941 	return (0);
    942 }
    943 
    944 /*
    945  * Poll system call.
    946  */
    947 int
    948 sys_poll(lwp_t *l, void *v, register_t *retval)
    949 {
    950 	struct sys_poll_args /* {
    951 		syscallarg(struct pollfd *)	fds;
    952 		syscallarg(u_int)		nfds;
    953 		syscallarg(int)			timeout;
    954 	} */ * const uap = v;
    955 	struct timeval	atv, *tv = NULL;
    956 
    957 	if (SCARG(uap, timeout) != INFTIM) {
    958 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    959 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    960 		tv = &atv;
    961 	}
    962 
    963 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    964 		tv, NULL);
    965 }
    966 
    967 /*
     968  * Poll system call (pollts: takes a timespec timeout and a signal mask).
    969  */
    970 int
    971 sys_pollts(lwp_t *l, void *v, register_t *retval)
    972 {
    973 	struct sys_pollts_args /* {
    974 		syscallarg(struct pollfd *)		fds;
    975 		syscallarg(u_int)			nfds;
    976 		syscallarg(const struct timespec *)	ts;
    977 		syscallarg(const sigset_t *)		mask;
    978 	} */ * const uap = v;
    979 	struct timespec	ats;
    980 	struct timeval	atv, *tv = NULL;
    981 	sigset_t	amask, *mask = NULL;
    982 	int		error;
    983 
    984 	if (SCARG(uap, ts)) {
    985 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    986 		if (error)
    987 			return error;
    988 		atv.tv_sec = ats.tv_sec;
    989 		atv.tv_usec = ats.tv_nsec / 1000;
    990 		tv = &atv;
    991 	}
    992 	if (SCARG(uap, mask)) {
    993 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    994 		if (error)
    995 			return error;
    996 		mask = &amask;
    997 	}
    998 
    999 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
   1000 		tv, mask);
   1001 }
   1002 
   1003 int
   1004 pollcommon(lwp_t *l, register_t *retval,
   1005 	struct pollfd *u_fds, u_int nfds,
   1006 	struct timeval *tv, sigset_t *mask)
   1007 {
   1008 	char		smallbits[32 * sizeof(struct pollfd)];
   1009 	proc_t		* const p = l->l_proc;
   1010 	void *		bits;
   1011 	sigset_t	oldmask;
   1012 	int		ncoll, error, timo;
   1013 	size_t		ni;
   1014 	struct timeval	sleeptv;
   1015 
   1016 	if (nfds > p->p_fd->fd_nfiles) {
    1017 		/* forgiving; slightly wrong: clamp instead of failing */
   1018 		nfds = p->p_fd->fd_nfiles;
   1019 	}
   1020 	ni = nfds * sizeof(struct pollfd);
   1021 	if (ni > sizeof(smallbits))
   1022 		bits = kmem_alloc(ni, KM_SLEEP);
   1023 	else
   1024 		bits = smallbits;
   1025 
   1026 	error = copyin(u_fds, bits, ni);
   1027 	if (error)
   1028 		goto done;
   1029 
   1030 	timo = 0;
   1031 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
   1032 		error = EINVAL;
   1033 		goto done;
   1034 	}
   1035 
   1036 	if (mask) {
   1037 		sigminusset(&sigcantmask, mask);
   1038 		mutex_enter(&p->p_smutex);
   1039 		oldmask = l->l_sigmask;
   1040 		l->l_sigmask = *mask;
   1041 		mutex_exit(&p->p_smutex);
   1042 	} else
   1043 		oldmask = l->l_sigmask;	/* XXXgcc */
   1044 
   1045 	mutex_enter(&select_lock);
   1046 	SLIST_INIT(&l->l_selwait);
   1047 	for (;;) {
   1048 		ncoll = nselcoll;
   1049 		l->l_selflag = SEL_SCANNING;
   1050 		mutex_exit(&select_lock);
   1051 
   1052 		error = pollscan(l, (struct pollfd *)bits, nfds, retval);
   1053 
   1054 		mutex_enter(&select_lock);
   1055 		if (error || *retval)
   1056 			break;
   1057 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
   1058 			break;
   1059 		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
   1060 			continue;
   1061 		l->l_selflag = SEL_BLOCKING;
   1062 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
   1063 		if (error != 0)
   1064 			break;
   1065 	}
   1066 	selclear();
   1067 	mutex_exit(&select_lock);
   1068 
   1069 	if (mask) {
   1070 		mutex_enter(&p->p_smutex);
   1071 		l->l_sigmask = oldmask;
   1072 		mutex_exit(&p->p_smutex);
   1073 	}
   1074  done:
   1075 	/* poll is not restarted after signals... */
   1076 	if (error == ERESTART)
   1077 		error = EINTR;
   1078 	if (error == EWOULDBLOCK)
   1079 		error = 0;
   1080 	if (error == 0)
   1081 		error = copyout(bits, u_fds, ni);
   1082 	if (bits != smallbits)
   1083 		kmem_free(bits, ni);
   1084 	return (error);
   1085 }
   1086 
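/*
 * Poll every entry in the pollfd array, filling in revents.  Descriptors
 * that are out of range or not open report POLLNVAL; negative descriptors
 * are skipped.  The number of entries with non-zero revents is returned
 * via retval.
 */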
   1087 int
   1088 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1089 {
   1090 	proc_t		*p = l->l_proc;
   1091 	struct filedesc	*fdp;
   1092 	int		i, n;
   1093 	struct file	*fp;
   1094 
   1095 	fdp = p->p_fd;
   1096 	n = 0;
   1097 	for (i = 0; i < nfd; i++, fds++) {
   1098 		if (fds->fd >= fdp->fd_nfiles) {
   1099 			fds->revents = POLLNVAL;
   1100 			n++;
   1101 		} else if (fds->fd < 0) {
   1102 			fds->revents = 0;
   1103 		} else {
   1104 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
   1105 				fds->revents = POLLNVAL;
   1106 				n++;
   1107 			} else {
   1108 				FILE_USE(fp);
   1109 				fds->revents = (*fp->f_ops->fo_poll)(fp,
   1110 				    fds->events | POLLERR | POLLHUP, l);
   1111 				if (fds->revents != 0)
   1112 					n++;
   1113 				FILE_UNUSE(fp, l);
   1114 			}
   1115 		}
   1116 	}
   1117 	*retval = n;
   1118 	return (0);
   1119 }
   1120 
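/*
 * Generic poll routine for objects that are always ready: reports the
 * normal read/write events as immediately true.  Typically used as the
 * poll entry for devices that never block.
 */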
   1121 /*ARGSUSED*/
   1122 int
   1123 seltrue(dev_t dev, int events, lwp_t *l)
   1124 {
   1125 
   1126 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1127 }
   1128 
   1129 /*
   1130  * Record a select request.
   1131  */
   1132 void
   1133 selrecord(lwp_t *selector, struct selinfo *sip)
   1134 {
   1135 
   1136 	mutex_enter(&select_lock);
   1137 	if (sip->sel_lwp == NULL) {
   1138 		/* First named waiter, although there may be more. */
   1139 		sip->sel_lwp = selector;
   1140 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1141 	} else if (sip->sel_lwp != selector) {
   1142 		/* Multiple waiters. */
   1143 		sip->sel_collision = true;
   1144 	}
   1145 	mutex_exit(&select_lock);
   1146 }
   1147 
   1148 /*
   1149  * Do a wakeup when a selectable event occurs.
   1150  */
   1151 void
   1152 selwakeup(struct selinfo *sip)
   1153 {
   1154 	lwp_t *l;
   1155 
   1156 	mutex_enter(&select_lock);
   1157 	if (sip->sel_collision) {
   1158 		/* Multiple waiters - just notify everybody. */
   1159 		nselcoll++;
   1160 		sip->sel_collision = false;
   1161 		cv_broadcast(&select_cv);
   1162 	} else if (sip->sel_lwp != NULL) {
   1163 		/* Only one LWP waiting. */
   1164 		l = sip->sel_lwp;
   1165 		if (l->l_selflag == SEL_BLOCKING) {
   1166 			/*
   1167 			 * If it's sleeping, wake it up.  If not, it's
   1168 			 * already awake but hasn't yet removed itself
   1169 			 * from the selector.  We reset the state below
   1170 			 * so that we only attempt to do this once.
   1171 			 */
   1172 			lwp_lock(l);
   1173 			if (l->l_wchan == &select_cv) {
   1174 				/* lwp_unsleep() releases the LWP lock. */
   1175 				lwp_unsleep(l);
   1176 			} else
   1177 				lwp_unlock(l);
   1178 		} else {
   1179 			/*
   1180 			 * Not yet asleep.  Reset its state below so that
   1181 			 * it will go around again.
   1182 			 */
   1183 		}
   1184 		l->l_selflag = SEL_RESET;
   1185 	}
   1186 	mutex_exit(&select_lock);
   1187 }
   1188 
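/*
 * Notify both select()/poll() waiters and any registered knotes of an
 * event; knhint is passed to KNOTE() as the kqueue hint.
 */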
   1189 void
   1190 selnotify(struct selinfo *sip, long knhint)
   1191 {
   1192 
   1193 	selwakeup(sip);
   1194 	KNOTE(&sip->sel_klist, knhint);
   1195 }
   1196 
   1197 /*
   1198  * Remove an LWP from all objects that it is waiting for.
   1199  */
   1200 static void
   1201 selclear(void)
   1202 {
   1203 	struct selinfo *sip;
   1204 	lwp_t *l = curlwp;
   1205 
   1206 	KASSERT(mutex_owned(&select_lock));
   1207 
   1208 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1209 		KASSERT(sip->sel_lwp == l);
   1210 		sip->sel_lwp = NULL;
   1211 	}
   1212 }
   1213 
   1214 /*
   1215  * Initialize the select/poll system calls.
   1216  */
   1217 void
   1218 selsysinit(void)
   1219 {
   1220 
   1221 	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
   1222 	cv_init(&select_cv, "select");
   1223 }
   1224