Home | History | Annotate | Line # | Download | only in kern
sys_generic.c revision 1.107
      1 /*	$NetBSD: sys_generic.c,v 1.107 2007/09/25 13:53:11 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.107 2007/09/25 13:53:11 ad Exp $");
     81 
     82 #include <sys/param.h>
     83 #include <sys/systm.h>
     84 #include <sys/filedesc.h>
     85 #include <sys/ioctl.h>
     86 #include <sys/file.h>
     87 #include <sys/proc.h>
     88 #include <sys/socketvar.h>
     89 #include <sys/signalvar.h>
     90 #include <sys/uio.h>
     91 #include <sys/kernel.h>
     92 #include <sys/stat.h>
     93 #include <sys/kmem.h>
     94 #include <sys/poll.h>
     95 #include <sys/vnode.h>
     96 #include <sys/mount.h>
     97 #include <sys/syscallargs.h>
     98 #include <sys/ktrace.h>
     99 
    100 #include <uvm/uvm_extern.h>
    101 
    102 /* Flags for lwp::l_selflag. */
    103 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
    104 #define	SEL_SCANNING	1	/* polling descriptors */
    105 #define	SEL_BLOCKING	2	/* about to block on select_cv */
    106 
    107 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
    108 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
    109 static void	selclear(void);
    110 
    111 /* Global state for select()/poll(). */
    112 kmutex_t	select_lock;
    113 kcondvar_t	select_cv;
    114 int		nselcoll;
    115 
    116 /*
    117  * Read system call.
    118  */
    119 /* ARGSUSED */
    120 int
    121 sys_read(lwp_t *l, void *v, register_t *retval)
    122 {
    123 	struct sys_read_args /* {
    124 		syscallarg(int)		fd;
    125 		syscallarg(void *)	buf;
    126 		syscallarg(size_t)	nbyte;
    127 	} */ *uap = v;
    128 	int		fd;
    129 	struct file	*fp;
    130 	proc_t		*p;
    131 	struct filedesc	*fdp;
    132 
    133 	fd = SCARG(uap, fd);
    134 	p = l->l_proc;
    135 	fdp = p->p_fd;
    136 
    137 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    138 		return (EBADF);
    139 
    140 	if ((fp->f_flag & FREAD) == 0) {
    141 		simple_unlock(&fp->f_slock);
    142 		return (EBADF);
    143 	}
    144 
    145 	FILE_USE(fp);
    146 
    147 	/* dofileread() will unuse the descriptor for us */
    148 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    149 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    150 }
    151 
    152 int
    153 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
    154 	off_t *offset, int flags, register_t *retval)
    155 {
    156 	struct iovec aiov;
    157 	struct uio auio;
    158 	proc_t *p;
    159 	struct vmspace *vm;
    160 	size_t cnt;
    161 	int error;
    162 	p = l->l_proc;
    163 
    164 	error = proc_vmspace_getref(p, &vm);
    165 	if (error) {
    166 		FILE_UNUSE(fp, l);
    167 		return error;
    168 	}
    169 
    170 	aiov.iov_base = (void *)buf;
    171 	aiov.iov_len = nbyte;
    172 	auio.uio_iov = &aiov;
    173 	auio.uio_iovcnt = 1;
    174 	auio.uio_resid = nbyte;
    175 	auio.uio_rw = UIO_READ;
    176 	auio.uio_vmspace = vm;
    177 
    178 	/*
    179 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    180 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    181 	 * values.
    182 	 */
    183 	if (auio.uio_resid > SSIZE_MAX) {
    184 		error = EINVAL;
    185 		goto out;
    186 	}
    187 
    188 	cnt = auio.uio_resid;
    189 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    190 	if (error)
    191 		if (auio.uio_resid != cnt && (error == ERESTART ||
    192 		    error == EINTR || error == EWOULDBLOCK))
    193 			error = 0;
    194 	cnt -= auio.uio_resid;
    195 	ktrgenio(fd, UIO_READ, buf, cnt, error);
    196 	*retval = cnt;
    197  out:
    198 	FILE_UNUSE(fp, l);
    199 	uvmspace_free(vm);
    200 	return (error);
    201 }
    202 
    203 /*
    204  * Scatter read system call.
    205  */
    206 int
    207 sys_readv(lwp_t *l, void *v, register_t *retval)
    208 {
    209 	struct sys_readv_args /* {
    210 		syscallarg(int)				fd;
    211 		syscallarg(const struct iovec *)	iovp;
    212 		syscallarg(int)				iovcnt;
    213 	} */ *uap = v;
    214 
    215 	return do_filereadv(l, SCARG(uap, fd), SCARG(uap, iovp),
    216 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    217 }
    218 
    219 int
    220 do_filereadv(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
    221     off_t *offset, int flags, register_t *retval)
    222 {
    223 	struct proc	*p;
    224 	struct uio	auio;
    225 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    226 	struct vmspace	*vm;
    227 	int		i, error;
    228 	size_t		cnt;
    229 	u_int		iovlen;
    230 	struct file	*fp;
    231 	struct filedesc	*fdp;
    232 	struct iovec	*ktriov = NULL;
    233 
    234 	if (iovcnt == 0)
    235 		return EINVAL;
    236 
    237 	p = l->l_proc;
    238 	fdp = p->p_fd;
    239 
    240 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    241 		return EBADF;
    242 
    243 	if ((fp->f_flag & FREAD) == 0) {
    244 		simple_unlock(&fp->f_slock);
    245 		return EBADF;
    246 	}
    247 
    248 	FILE_USE(fp);
    249 
    250 	error = proc_vmspace_getref(p, &vm);
    251 	if (error) {
    252 		FILE_UNUSE(fp, l);
    253 		return error;
    254 	}
    255 
    256 	if (offset == NULL)
    257 		offset = &fp->f_offset;
    258 	else {
    259 		struct vnode *vp = fp->f_data;
    260 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    261 			error = ESPIPE;
    262 			goto out;
    263 		}
    264 		/*
    265 		 * Test that the device is seekable ?
    266 		 * XXX This works because no file systems actually
    267 		 * XXX take any action on the seek operation.
    268 		 */
    269 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    270 		if (error != 0)
    271 			goto out;
    272 	}
    273 
    274 	iovlen = iovcnt * sizeof(struct iovec);
    275 	if (flags & FOF_IOV_SYSSPACE)
    276 		iov = __UNCONST(iovp);
    277 	else {
    278 		iov = aiov;
    279 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    280 			if ((u_int)iovcnt > IOV_MAX) {
    281 				error = EINVAL;
    282 				goto out;
    283 			}
    284 			iov = kmem_alloc(iovlen, KM_SLEEP);
    285 			if (iov == NULL) {
    286 				error = ENOMEM;
    287 				goto out;
    288 			}
    289 			needfree = iov;
    290 		}
    291 		error = copyin(iovp, iov, iovlen);
    292 		if (error)
    293 			goto done;
    294 	}
    295 
    296 	auio.uio_iov = iov;
    297 	auio.uio_iovcnt = iovcnt;
    298 	auio.uio_rw = UIO_READ;
    299 	auio.uio_vmspace = vm;
    300 
    301 	auio.uio_resid = 0;
    302 	for (i = 0; i < iovcnt; i++, iov++) {
    303 		auio.uio_resid += iov->iov_len;
    304 		/*
    305 		 * Reads return ssize_t because -1 is returned on error.
    306 		 * Therefore we must restrict the length to SSIZE_MAX to
    307 		 * avoid garbage return values.
    308 		 */
    309 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    310 			error = EINVAL;
    311 			goto done;
    312 		}
    313 	}
    314 
    315 	/*
    316 	 * if tracing, save a copy of iovec
    317 	 */
    318 	if (ktrpoint(KTR_GENIO))  {
    319 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    320 		if (ktriov != NULL)
    321 			memcpy(ktriov, auio.uio_iov, iovlen);
    322 	}
    323 
    324 	cnt = auio.uio_resid;
    325 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    326 	if (error)
    327 		if (auio.uio_resid != cnt && (error == ERESTART ||
    328 		    error == EINTR || error == EWOULDBLOCK))
    329 			error = 0;
    330 	cnt -= auio.uio_resid;
    331 	*retval = cnt;
    332 
    333 	if (ktriov != NULL) {
    334 		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
    335 		kmem_free(ktriov, iovlen);
    336 	}
    337 
    338  done:
    339 	if (needfree)
    340 		kmem_free(needfree, iovlen);
    341  out:
    342 	FILE_UNUSE(fp, l);
    343 	uvmspace_free(vm);
    344 	return (error);
    345 }
    346 
    347 /*
    348  * Write system call
    349  */
    350 int
    351 sys_write(lwp_t *l, void *v, register_t *retval)
    352 {
    353 	struct sys_write_args /* {
    354 		syscallarg(int)			fd;
    355 		syscallarg(const void *)	buf;
    356 		syscallarg(size_t)		nbyte;
    357 	} */ *uap = v;
    358 	int		fd;
    359 	struct file	*fp;
    360 	proc_t		*p;
    361 	struct filedesc	*fdp;
    362 
    363 	fd = SCARG(uap, fd);
    364 	p = l->l_proc;
    365 	fdp = p->p_fd;
    366 
    367 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    368 		return (EBADF);
    369 
    370 	if ((fp->f_flag & FWRITE) == 0) {
    371 		simple_unlock(&fp->f_slock);
    372 		return (EBADF);
    373 	}
    374 
    375 	FILE_USE(fp);
    376 
    377 	/* dofilewrite() will unuse the descriptor for us */
    378 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    379 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    380 }
    381 
    382 int
    383 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
    384 	size_t nbyte, off_t *offset, int flags, register_t *retval)
    385 {
    386 	struct iovec aiov;
    387 	struct uio auio;
    388 	proc_t *p;
    389 	struct vmspace *vm;
    390 	size_t cnt;
    391 	int error;
    392 
    393 	p = l->l_proc;
    394 	error = proc_vmspace_getref(p, &vm);
    395 	if (error) {
    396 		FILE_UNUSE(fp, l);
    397 		return error;
    398 	}
    399 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
    400 	aiov.iov_len = nbyte;
    401 	auio.uio_iov = &aiov;
    402 	auio.uio_iovcnt = 1;
    403 	auio.uio_resid = nbyte;
    404 	auio.uio_rw = UIO_WRITE;
    405 	auio.uio_vmspace = vm;
    406 
    407 	/*
    408 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    409 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    410 	 * values.
    411 	 */
    412 	if (auio.uio_resid > SSIZE_MAX) {
    413 		error = EINVAL;
    414 		goto out;
    415 	}
    416 
    417 	cnt = auio.uio_resid;
    418 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    419 	if (error) {
    420 		if (auio.uio_resid != cnt && (error == ERESTART ||
    421 		    error == EINTR || error == EWOULDBLOCK))
    422 			error = 0;
    423 		if (error == EPIPE) {
    424 			mutex_enter(&proclist_mutex);
    425 			psignal(p, SIGPIPE);
    426 			mutex_exit(&proclist_mutex);
    427 		}
    428 	}
    429 	cnt -= auio.uio_resid;
    430 	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
    431 	*retval = cnt;
    432  out:
    433 	FILE_UNUSE(fp, l);
    434 	uvmspace_free(vm);
    435 	return (error);
    436 }
    437 
    438 /*
    439  * Gather write system call
    440  */
    441 int
    442 sys_writev(lwp_t *l, void *v, register_t *retval)
    443 {
    444 	struct sys_writev_args /* {
    445 		syscallarg(int)				fd;
    446 		syscallarg(const struct iovec *)	iovp;
    447 		syscallarg(int)				iovcnt;
    448 	} */ *uap = v;
    449 
    450 	return do_filewritev(l, SCARG(uap, fd), SCARG(uap, iovp),
    451 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    452 }
    453 
    454 int
    455 do_filewritev(struct lwp *l, int fd, const struct iovec *iovp, int iovcnt,
    456     off_t *offset, int flags, register_t *retval)
    457 {
    458 	struct proc	*p;
    459 	struct uio	auio;
    460 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    461 	struct vmspace	*vm;
    462 	int		i, error;
    463 	size_t		cnt;
    464 	u_int		iovlen;
    465 	struct file	*fp;
    466 	struct filedesc	*fdp;
    467 	struct iovec	*ktriov = NULL;
    468 
    469 	if (iovcnt == 0)
    470 		return EINVAL;
    471 
    472 	p = l->l_proc;
    473 	fdp = p->p_fd;
    474 
    475 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    476 		return EBADF;
    477 
    478 	if ((fp->f_flag & FWRITE) == 0) {
    479 		simple_unlock(&fp->f_slock);
    480 		return EBADF;
    481 	}
    482 
    483 	FILE_USE(fp);
    484 
    485 	error = proc_vmspace_getref(p, &vm);
    486 	if (error) {
    487 		FILE_UNUSE(fp, l);
    488 		return error;
    489 	}
    490 
    491 	if (offset == NULL)
    492 		offset = &fp->f_offset;
    493 	else {
    494 		struct vnode *vp = fp->f_data;
    495 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    496 			error = ESPIPE;
    497 			goto out;
    498 		}
    499 		/*
    500 		 * Test that the device is seekable ?
    501 		 * XXX This works because no file systems actually
    502 		 * XXX take any action on the seek operation.
    503 		 */
    504 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    505 		if (error != 0)
    506 			goto out;
    507 	}
    508 
    509 	iovlen = iovcnt * sizeof(struct iovec);
    510 	if (flags & FOF_IOV_SYSSPACE)
    511 		iov = __UNCONST(iovp);
    512 	else {
    513 		iov = aiov;
    514 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    515 			if ((u_int)iovcnt > IOV_MAX) {
    516 				error = EINVAL;
    517 				goto out;
    518 			}
    519 			iov = kmem_alloc(iovlen, KM_SLEEP);
    520 			if (iov == NULL) {
    521 				error = ENOMEM;
    522 				goto out;
    523 			}
    524 			needfree = iov;
    525 		}
    526 		error = copyin(iovp, iov, iovlen);
    527 		if (error)
    528 			goto done;
    529 	}
    530 
    531 	auio.uio_iov = iov;
    532 	auio.uio_iovcnt = iovcnt;
    533 	auio.uio_rw = UIO_WRITE;
    534 	auio.uio_vmspace = vm;
    535 
    536 	auio.uio_resid = 0;
    537 	for (i = 0; i < iovcnt; i++, iov++) {
    538 		auio.uio_resid += iov->iov_len;
    539 		/*
    540 		 * Writes return ssize_t because -1 is returned on error.
    541 		 * Therefore we must restrict the length to SSIZE_MAX to
    542 		 * avoid garbage return values.
    543 		 */
    544 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    545 			error = EINVAL;
    546 			goto done;
    547 		}
    548 	}
    549 
    550 	/*
    551 	 * if tracing, save a copy of iovec
    552 	 */
    553 	if (ktrpoint(KTR_GENIO))  {
    554 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    555 		if (ktriov != NULL)
    556 			memcpy(ktriov, auio.uio_iov, iovlen);
    557 	}
    558 
    559 	cnt = auio.uio_resid;
    560 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    561 	if (error) {
    562 		if (auio.uio_resid != cnt && (error == ERESTART ||
    563 		    error == EINTR || error == EWOULDBLOCK))
    564 			error = 0;
    565 		if (error == EPIPE) {
    566 			mutex_enter(&proclist_mutex);
    567 			psignal(p, SIGPIPE);
    568 			mutex_exit(&proclist_mutex);
    569 		}
    570 	}
    571 	cnt -= auio.uio_resid;
    572 	*retval = cnt;
    573 
    574 	if (ktriov != NULL) {
    575 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
    576 		kmem_free(ktriov, iovlen);
    577 	}
    578 
    579  done:
    580 	if (needfree)
    581 		kmem_free(needfree, iovlen);
    582  out:
    583 	FILE_UNUSE(fp, l);
    584 	uvmspace_free(vm);
    585 	return (error);
    586 }
    587 
    588 /*
    589  * Ioctl system call
    590  */
    591 /* ARGSUSED */
    592 int
    593 sys_ioctl(struct lwp *l, void *v, register_t *retval)
    594 {
    595 	struct sys_ioctl_args /* {
    596 		syscallarg(int)		fd;
    597 		syscallarg(u_long)	com;
    598 		syscallarg(void *)	data;
    599 	} */ *uap = v;
    600 	struct file	*fp;
    601 	proc_t		*p;
    602 	struct filedesc	*fdp;
    603 	u_long		com;
    604 	int		error;
    605 	u_int		size;
    606 	void 		*data, *memp;
    607 #define	STK_PARAMS	128
    608 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
    609 
    610 	error = 0;
    611 	p = l->l_proc;
    612 	fdp = p->p_fd;
    613 
    614 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
    615 		return (EBADF);
    616 
    617 	FILE_USE(fp);
    618 
    619 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
    620 		error = EBADF;
    621 		com = 0;
    622 		goto out;
    623 	}
    624 
    625 	switch (com = SCARG(uap, com)) {
    626 	case FIONCLEX:
    627 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
    628 		goto out;
    629 
    630 	case FIOCLEX:
    631 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
    632 		goto out;
    633 	}
    634 
    635 	/*
    636 	 * Interpret high order word to find amount of data to be
    637 	 * copied to/from the user's address space.
    638 	 */
    639 	size = IOCPARM_LEN(com);
    640 	if (size > IOCPARM_MAX) {
    641 		error = ENOTTY;
    642 		goto out;
    643 	}
    644 	memp = NULL;
    645 	if (size > sizeof(stkbuf)) {
    646 		memp = kmem_alloc(size, KM_SLEEP);
    647 		data = memp;
    648 	} else
    649 		data = (void *)stkbuf;
    650 	if (com&IOC_IN) {
    651 		if (size) {
    652 			error = copyin(SCARG(uap, data), data, size);
    653 			if (error) {
    654 				if (memp)
    655 					kmem_free(memp, size);
    656 				goto out;
    657 			}
    658 			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
    659 			    size, 0);
    660 		} else
    661 			*(void **)data = SCARG(uap, data);
    662 	} else if ((com&IOC_OUT) && size)
    663 		/*
    664 		 * Zero the buffer so the user always
    665 		 * gets back something deterministic.
    666 		 */
    667 		memset(data, 0, size);
    668 	else if (com&IOC_VOID)
    669 		*(void **)data = SCARG(uap, data);
    670 
    671 	switch (com) {
    672 
    673 	case FIONBIO:
    674 		if (*(int *)data != 0)
    675 			fp->f_flag |= FNONBLOCK;
    676 		else
    677 			fp->f_flag &= ~FNONBLOCK;
    678 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
    679 		break;
    680 
    681 	case FIOASYNC:
    682 		if (*(int *)data != 0)
    683 			fp->f_flag |= FASYNC;
    684 		else
    685 			fp->f_flag &= ~FASYNC;
    686 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
    687 		break;
    688 
    689 	default:
    690 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
    691 		/*
    692 		 * Copy any data to user, size was
    693 		 * already set and checked above.
    694 		 */
    695 		if (error == 0 && (com&IOC_OUT) && size) {
    696 			error = copyout(data, SCARG(uap, data), size);
    697 			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
    698 			    size, error);
    699 		}
    700 		break;
    701 	}
    702 	if (memp)
    703 		kmem_free(memp, size);
    704  out:
    705 	FILE_UNUSE(fp, l);
    706 	switch (error) {
    707 	case -1:
    708 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
    709 		    "pid=%d comm=%s\n",
    710 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
    711 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
    712 		    p->p_pid, p->p_comm);
    713 		/* FALLTHROUGH */
    714 	case EPASSTHROUGH:
    715 		error = ENOTTY;
    716 		/* FALLTHROUGH */
    717 	default:
    718 		return (error);
    719 	}
    720 }
    721 
    722 /*
    723  * Select system call.
    724  */
    725 int
    726 sys_pselect(lwp_t *l, void *v, register_t *retval)
    727 {
    728 	struct sys_pselect_args /* {
    729 		syscallarg(int)				nd;
    730 		syscallarg(fd_set *)			in;
    731 		syscallarg(fd_set *)			ou;
    732 		syscallarg(fd_set *)			ex;
    733 		syscallarg(const struct timespec *)	ts;
    734 		syscallarg(sigset_t *)			mask;
    735 	} */ * const uap = v;
    736 	struct timespec	ats;
    737 	struct timeval	atv, *tv = NULL;
    738 	sigset_t	amask, *mask = NULL;
    739 	int		error;
    740 
    741 	if (SCARG(uap, ts)) {
    742 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    743 		if (error)
    744 			return error;
    745 		atv.tv_sec = ats.tv_sec;
    746 		atv.tv_usec = ats.tv_nsec / 1000;
    747 		tv = &atv;
    748 	}
    749 	if (SCARG(uap, mask) != NULL) {
    750 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    751 		if (error)
    752 			return error;
    753 		mask = &amask;
    754 	}
    755 
    756 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    757 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    758 }
    759 
/*
 * Prepare a timeout for use with gettimeleft().
 *
 * Passes tv through itimerfix() and, when that succeeds, records the
 * current uptime in *sleeptv.  Returns 0 on success, or -1 when
 * itimerfix() reports a non-zero result.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv) == 0) {
		getmicrouptime(sleeptv);
		return 0;
	}
	return -1;
}
    768 
/*
 * Recompute the remaining timeout; this has to be done on every retry.
 *
 * On entry *sleeptv holds the uptime recorded by the previous call (or
 * by inittimeleft()).  *tv is reduced by the uptime that has elapsed
 * since then, *sleeptv is advanced to the current uptime, and the
 * remaining time is returned as converted by tvtohz().
 */
int
gettimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	struct timeval now;

	getmicrouptime(&now);

	/* tv = tv - (now - *sleeptv), on the monotonic uptime scale. */
	timeradd(tv, sleeptv, tv);
	timersub(tv, &now, tv);

	*sleeptv = now;
	return tvtohz(tv);
}
    786 
    787 int
    788 sys_select(lwp_t *l, void *v, register_t *retval)
    789 {
    790 	struct sys_select_args /* {
    791 		syscallarg(int)			nd;
    792 		syscallarg(fd_set *)		in;
    793 		syscallarg(fd_set *)		ou;
    794 		syscallarg(fd_set *)		ex;
    795 		syscallarg(struct timeval *)	tv;
    796 	} */ * const uap = v;
    797 	struct timeval atv, *tv = NULL;
    798 	int error;
    799 
    800 	if (SCARG(uap, tv)) {
    801 		error = copyin(SCARG(uap, tv), (void *)&atv,
    802 			sizeof(atv));
    803 		if (error)
    804 			return error;
    805 		tv = &atv;
    806 	}
    807 
    808 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    809 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    810 }
    811 
    812 int
    813 selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    814 	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
    815 {
    816 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
    817 			    sizeof(fd_mask) * 6];
    818 	proc_t		* const p = l->l_proc;
    819 	char 		*bits;
    820 	int		ncoll, error, timo;
    821 	size_t		ni;
    822 	sigset_t	oldmask;
    823 	struct timeval  sleeptv;
    824 
    825 	error = 0;
    826 	if (nd < 0)
    827 		return (EINVAL);
    828 	if (nd > p->p_fd->fd_nfiles) {
    829 		/* forgiving; slightly wrong */
    830 		nd = p->p_fd->fd_nfiles;
    831 	}
    832 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
    833 	if (ni * 6 > sizeof(smallbits))
    834 		bits = kmem_alloc(ni * 6, KM_SLEEP);
    835 	else
    836 		bits = smallbits;
    837 
    838 #define	getbits(name, x)						\
    839 	if (u_ ## name) {						\
    840 		error = copyin(u_ ## name, bits + ni * x, ni);		\
    841 		if (error)						\
    842 			goto done;					\
    843 	} else								\
    844 		memset(bits + ni * x, 0, ni);
    845 	getbits(in, 0);
    846 	getbits(ou, 1);
    847 	getbits(ex, 2);
    848 #undef	getbits
    849 
    850 	timo = 0;
    851 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
    852 		error = EINVAL;
    853 		goto done;
    854 	}
    855 
    856 	if (mask) {
    857 		sigminusset(&sigcantmask, mask);
    858 		mutex_enter(&p->p_smutex);
    859 		oldmask = l->l_sigmask;
    860 		l->l_sigmask = *mask;
    861 		mutex_exit(&p->p_smutex);
    862 	} else
    863 		oldmask = l->l_sigmask;	/* XXXgcc */
    864 
    865 	mutex_enter(&select_lock);
    866 	SLIST_INIT(&l->l_selwait);
    867 	for (;;) {
    868 	 	l->l_selflag = SEL_SCANNING;
    869 		ncoll = nselcoll;
    870  		mutex_exit(&select_lock);
    871 
    872 		error = selscan(l, (fd_mask *)(bits + ni * 0),
    873 		    (fd_mask *)(bits + ni * 3), nd, retval);
    874 
    875 		mutex_enter(&select_lock);
    876 		if (error || *retval)
    877 			break;
    878 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
    879 			break;
    880 		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
    881 			continue;
    882 		l->l_selflag = SEL_BLOCKING;
    883 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
    884 		if (error != 0)
    885 			break;
    886 	}
    887 	selclear();
    888 	mutex_exit(&select_lock);
    889 
    890 	if (mask) {
    891 		mutex_enter(&p->p_smutex);
    892 		l->l_sigmask = oldmask;
    893 		mutex_exit(&p->p_smutex);
    894 	}
    895 
    896  done:
    897 	/* select is not restarted after signals... */
    898 	if (error == ERESTART)
    899 		error = EINTR;
    900 	if (error == EWOULDBLOCK)
    901 		error = 0;
    902 	if (error == 0 && u_in != NULL)
    903 		error = copyout(bits + ni * 3, u_in, ni);
    904 	if (error == 0 && u_ou != NULL)
    905 		error = copyout(bits + ni * 4, u_ou, ni);
    906 	if (error == 0 && u_ex != NULL)
    907 		error = copyout(bits + ni * 5, u_ex, ni);
    908 	if (bits != smallbits)
    909 		kmem_free(bits, ni * 6);
    910 	return (error);
    911 }
    912 
    913 int
    914 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    915 	register_t *retval)
    916 {
    917 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    918 			       POLLWRNORM | POLLHUP | POLLERR,
    919 			       POLLRDBAND };
    920 	proc_t *p = l->l_proc;
    921 	struct filedesc	*fdp;
    922 	int msk, i, j, fd, n;
    923 	fd_mask ibits, obits;
    924 	struct file *fp;
    925 
    926 	fdp = p->p_fd;
    927 	n = 0;
    928 	for (msk = 0; msk < 3; msk++) {
    929 		for (i = 0; i < nfd; i += NFDBITS) {
    930 			ibits = *ibitp++;
    931 			obits = 0;
    932 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    933 				ibits &= ~(1 << j);
    934 				if ((fp = fd_getfile(fdp, fd)) == NULL)
    935 					return (EBADF);
    936 				FILE_USE(fp);
    937 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
    938 					obits |= (1 << j);
    939 					n++;
    940 				}
    941 				FILE_UNUSE(fp, l);
    942 			}
    943 			*obitp++ = obits;
    944 		}
    945 	}
    946 	*retval = n;
    947 	return (0);
    948 }
    949 
    950 /*
    951  * Poll system call.
    952  */
    953 int
    954 sys_poll(lwp_t *l, void *v, register_t *retval)
    955 {
    956 	struct sys_poll_args /* {
    957 		syscallarg(struct pollfd *)	fds;
    958 		syscallarg(u_int)		nfds;
    959 		syscallarg(int)			timeout;
    960 	} */ * const uap = v;
    961 	struct timeval	atv, *tv = NULL;
    962 
    963 	if (SCARG(uap, timeout) != INFTIM) {
    964 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    965 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    966 		tv = &atv;
    967 	}
    968 
    969 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    970 		tv, NULL);
    971 }
    972 
    973 /*
    974  * Poll system call.
    975  */
    976 int
    977 sys_pollts(lwp_t *l, void *v, register_t *retval)
    978 {
    979 	struct sys_pollts_args /* {
    980 		syscallarg(struct pollfd *)		fds;
    981 		syscallarg(u_int)			nfds;
    982 		syscallarg(const struct timespec *)	ts;
    983 		syscallarg(const sigset_t *)		mask;
    984 	} */ * const uap = v;
    985 	struct timespec	ats;
    986 	struct timeval	atv, *tv = NULL;
    987 	sigset_t	amask, *mask = NULL;
    988 	int		error;
    989 
    990 	if (SCARG(uap, ts)) {
    991 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    992 		if (error)
    993 			return error;
    994 		atv.tv_sec = ats.tv_sec;
    995 		atv.tv_usec = ats.tv_nsec / 1000;
    996 		tv = &atv;
    997 	}
    998 	if (SCARG(uap, mask)) {
    999 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
   1000 		if (error)
   1001 			return error;
   1002 		mask = &amask;
   1003 	}
   1004 
   1005 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
   1006 		tv, mask);
   1007 }
   1008 
   1009 int
   1010 pollcommon(lwp_t *l, register_t *retval,
   1011 	struct pollfd *u_fds, u_int nfds,
   1012 	struct timeval *tv, sigset_t *mask)
   1013 {
   1014 	char		smallbits[32 * sizeof(struct pollfd)];
   1015 	proc_t		* const p = l->l_proc;
   1016 	void *		bits;
   1017 	sigset_t	oldmask;
   1018 	int		ncoll, error, timo;
   1019 	size_t		ni;
   1020 	struct timeval	sleeptv;
   1021 
   1022 	if (nfds > p->p_fd->fd_nfiles) {
   1023 		/* forgiving; slightly wrong */
   1024 		nfds = p->p_fd->fd_nfiles;
   1025 	}
   1026 	ni = nfds * sizeof(struct pollfd);
   1027 	if (ni > sizeof(smallbits))
   1028 		bits = kmem_alloc(ni, KM_SLEEP);
   1029 	else
   1030 		bits = smallbits;
   1031 
   1032 	error = copyin(u_fds, bits, ni);
   1033 	if (error)
   1034 		goto done;
   1035 
   1036 	timo = 0;
   1037 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
   1038 		error = EINVAL;
   1039 		goto done;
   1040 	}
   1041 
   1042 	if (mask) {
   1043 		sigminusset(&sigcantmask, mask);
   1044 		mutex_enter(&p->p_smutex);
   1045 		oldmask = l->l_sigmask;
   1046 		l->l_sigmask = *mask;
   1047 		mutex_exit(&p->p_smutex);
   1048 	} else
   1049 		oldmask = l->l_sigmask;	/* XXXgcc */
   1050 
   1051 	mutex_enter(&select_lock);
   1052 	SLIST_INIT(&l->l_selwait);
   1053 	for (;;) {
   1054 		ncoll = nselcoll;
   1055 		l->l_selflag = SEL_SCANNING;
   1056 		mutex_exit(&select_lock);
   1057 
   1058 		error = pollscan(l, (struct pollfd *)bits, nfds, retval);
   1059 
   1060 		mutex_enter(&select_lock);
   1061 		if (error || *retval)
   1062 			break;
   1063 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
   1064 			break;
   1065 		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
   1066 			continue;
   1067 		l->l_selflag = SEL_BLOCKING;
   1068 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
   1069 		if (error != 0)
   1070 			break;
   1071 	}
   1072 	selclear();
   1073 	mutex_exit(&select_lock);
   1074 
   1075 	if (mask) {
   1076 		mutex_enter(&p->p_smutex);
   1077 		l->l_sigmask = oldmask;
   1078 		mutex_exit(&p->p_smutex);
   1079 	}
   1080  done:
   1081 	/* poll is not restarted after signals... */
   1082 	if (error == ERESTART)
   1083 		error = EINTR;
   1084 	if (error == EWOULDBLOCK)
   1085 		error = 0;
   1086 	if (error == 0)
   1087 		error = copyout(bits, u_fds, ni);
   1088 	if (bits != smallbits)
   1089 		kmem_free(bits, ni);
   1090 	return (error);
   1091 }
   1092 
   1093 int
   1094 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1095 {
   1096 	proc_t		*p = l->l_proc;
   1097 	struct filedesc	*fdp;
   1098 	int		i, n;
   1099 	struct file	*fp;
   1100 
   1101 	fdp = p->p_fd;
   1102 	n = 0;
   1103 	for (i = 0; i < nfd; i++, fds++) {
   1104 		if (fds->fd >= fdp->fd_nfiles) {
   1105 			fds->revents = POLLNVAL;
   1106 			n++;
   1107 		} else if (fds->fd < 0) {
   1108 			fds->revents = 0;
   1109 		} else {
   1110 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
   1111 				fds->revents = POLLNVAL;
   1112 				n++;
   1113 			} else {
   1114 				FILE_USE(fp);
   1115 				fds->revents = (*fp->f_ops->fo_poll)(fp,
   1116 				    fds->events | POLLERR | POLLHUP, l);
   1117 				if (fds->revents != 0)
   1118 					n++;
   1119 				FILE_UNUSE(fp, l);
   1120 			}
   1121 		}
   1122 	}
   1123 	*retval = n;
   1124 	return (0);
   1125 }
   1126 
   1127 /*ARGSUSED*/
   1128 int
   1129 seltrue(dev_t dev, int events, lwp_t *l)
   1130 {
   1131 
   1132 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1133 }
   1134 
   1135 /*
   1136  * Record a select request.
   1137  */
   1138 void
   1139 selrecord(lwp_t *selector, struct selinfo *sip)
   1140 {
   1141 
   1142 	mutex_enter(&select_lock);
   1143 	if (sip->sel_lwp == NULL) {
   1144 		/* First named waiter, although there may be more. */
   1145 		sip->sel_lwp = selector;
   1146 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1147 	} else if (sip->sel_lwp != selector) {
   1148 		/* Multiple waiters. */
   1149 		sip->sel_collision = true;
   1150 	}
   1151 	mutex_exit(&select_lock);
   1152 }
   1153 
   1154 /*
   1155  * Do a wakeup when a selectable event occurs.
   1156  */
   1157 void
   1158 selwakeup(struct selinfo *sip)
   1159 {
   1160 	lwp_t *l;
   1161 
   1162 	mutex_enter(&select_lock);
   1163 	if (sip->sel_collision) {
   1164 		/* Multiple waiters - just notify everybody. */
   1165 		nselcoll++;
   1166 		sip->sel_collision = false;
   1167 		cv_broadcast(&select_cv);
   1168 	} else if (sip->sel_lwp != NULL) {
   1169 		/* Only one LWP waiting. */
   1170 		l = sip->sel_lwp;
   1171 		if (l->l_selflag == SEL_BLOCKING) {
   1172 			/*
   1173 			 * If it's sleeping, wake it up.  If not, it's
   1174 			 * already awake but hasn't yet removed itself
   1175 			 * from the selector.  We reset the state below
   1176 			 * so that we only attempt to do this once.
   1177 			 */
   1178 			lwp_lock(l);
   1179 			if (l->l_wchan == &select_cv) {
   1180 				/* lwp_unsleep() releases the LWP lock. */
   1181 				lwp_unsleep(l);
   1182 			} else
   1183 				lwp_unlock(l);
   1184 		} else {
   1185 			/*
   1186 			 * Not yet asleep.  Reset its state below so that
   1187 			 * it will go around again.
   1188 			 */
   1189 		}
   1190 		l->l_selflag = SEL_RESET;
   1191 	}
   1192 	mutex_exit(&select_lock);
   1193 }
   1194 
   1195 void
   1196 selnotify(struct selinfo *sip, long knhint)
   1197 {
   1198 
   1199 	selwakeup(sip);
   1200 	KNOTE(&sip->sel_klist, knhint);
   1201 }
   1202 
   1203 /*
   1204  * Remove an LWP from all objects that it is waiting for.
   1205  */
   1206 static void
   1207 selclear(void)
   1208 {
   1209 	struct selinfo *sip;
   1210 	lwp_t *l = curlwp;
   1211 
   1212 	KASSERT(mutex_owned(&select_lock));
   1213 
   1214 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1215 		KASSERT(sip->sel_lwp == l);
   1216 		sip->sel_lwp = NULL;
   1217 	}
   1218 }
   1219 
   1220 /*
   1221  * Initialize the select/poll system calls.
   1222  */
   1223 void
   1224 selsysinit(void)
   1225 {
   1226 
   1227 	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
   1228 	cv_init(&select_cv, "select");
   1229 }
   1230 
   1231 /*
   1232  * Initialize a selector.
   1233  */
   1234 void
   1235 selinit(struct selinfo *sip)
   1236 {
   1237 
   1238 	memset(sip, 0, sizeof(*sip));
   1239 }
   1240 
   1241 /*
   1242  * Destroy a selector.  The owning object must not gain new
   1243  * references while this is in progress: all activity on the
   1244  * selector must be stopped.
   1245  */
   1246 void
   1247 seldestroy(struct selinfo *sip)
   1248 {
   1249 	lwp_t *l;
   1250 
   1251 	if (sip->sel_lwp == NULL)
   1252 		return;
   1253 
   1254 	mutex_enter(&select_lock);
   1255 	if ((l = sip->sel_lwp) != NULL) {
   1256 		/* This should rarely happen, so SLIST_REMOVE() is OK. */
   1257 		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
   1258 		sip->sel_lwp = NULL;
   1259 	}
   1260 	mutex_exit(&select_lock);
   1261 }
   1262