/* sys_generic.c revision 1.114 — NetBSD src/sys/kern */
      1 /*	$NetBSD: sys_generic.c,v 1.114 2008/03/17 18:01:44 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.114 2008/03/17 18:01:44 ad Exp $");
     81 
     82 #include <sys/param.h>
     83 #include <sys/systm.h>
     84 #include <sys/filedesc.h>
     85 #include <sys/ioctl.h>
     86 #include <sys/file.h>
     87 #include <sys/proc.h>
     88 #include <sys/socketvar.h>
     89 #include <sys/signalvar.h>
     90 #include <sys/uio.h>
     91 #include <sys/kernel.h>
     92 #include <sys/stat.h>
     93 #include <sys/kmem.h>
     94 #include <sys/poll.h>
     95 #include <sys/vnode.h>
     96 #include <sys/mount.h>
     97 #include <sys/syscallargs.h>
     98 #include <sys/ktrace.h>
     99 
    100 #include <uvm/uvm_extern.h>
    101 
    102 /* Flags for lwp::l_selflag. */
    103 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
    104 #define	SEL_SCANNING	1	/* polling descriptors */
    105 #define	SEL_BLOCKING	2	/* about to block on select_cv */
    106 
    107 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
    108 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
    109 
    110 /* Global state for select()/poll(). */
    111 kmutex_t	select_lock;
    112 kcondvar_t	select_cv;
    113 int		nselcoll;
    114 
    115 /*
    116  * Read system call.
    117  */
    118 /* ARGSUSED */
    119 int
    120 sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
    121 {
    122 	/* {
    123 		syscallarg(int)		fd;
    124 		syscallarg(void *)	buf;
    125 		syscallarg(size_t)	nbyte;
    126 	} */
    127 	int		fd;
    128 	struct file	*fp;
    129 	proc_t		*p;
    130 	struct filedesc	*fdp;
    131 
    132 	fd = SCARG(uap, fd);
    133 	p = l->l_proc;
    134 	fdp = p->p_fd;
    135 
    136 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    137 		return (EBADF);
    138 
    139 	if ((fp->f_flag & FREAD) == 0) {
    140 		FILE_UNLOCK(fp);
    141 		return (EBADF);
    142 	}
    143 
    144 	FILE_USE(fp);
    145 
    146 	/* dofileread() will unuse the descriptor for us */
    147 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    148 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    149 }
    150 
    151 int
    152 dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
    153 	off_t *offset, int flags, register_t *retval)
    154 {
    155 	struct iovec aiov;
    156 	struct uio auio;
    157 	size_t cnt;
    158 	int error;
    159 	lwp_t *l;
    160 
    161 	l = curlwp;
    162 
    163 	aiov.iov_base = (void *)buf;
    164 	aiov.iov_len = nbyte;
    165 	auio.uio_iov = &aiov;
    166 	auio.uio_iovcnt = 1;
    167 	auio.uio_resid = nbyte;
    168 	auio.uio_rw = UIO_READ;
    169 	auio.uio_vmspace = l->l_proc->p_vmspace;
    170 
    171 	/*
    172 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    173 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    174 	 * values.
    175 	 */
    176 	if (auio.uio_resid > SSIZE_MAX) {
    177 		error = EINVAL;
    178 		goto out;
    179 	}
    180 
    181 	cnt = auio.uio_resid;
    182 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    183 	if (error)
    184 		if (auio.uio_resid != cnt && (error == ERESTART ||
    185 		    error == EINTR || error == EWOULDBLOCK))
    186 			error = 0;
    187 	cnt -= auio.uio_resid;
    188 	ktrgenio(fd, UIO_READ, buf, cnt, error);
    189 	*retval = cnt;
    190  out:
    191 	FILE_UNUSE(fp, l);
    192 	return (error);
    193 }
    194 
    195 /*
    196  * Scatter read system call.
    197  */
    198 int
    199 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
    200 {
    201 	/* {
    202 		syscallarg(int)				fd;
    203 		syscallarg(const struct iovec *)	iovp;
    204 		syscallarg(int)				iovcnt;
    205 	} */
    206 
    207 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
    208 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    209 }
    210 
/*
 * Common code for readv(2)-style scatter reads: read into the iovcnt
 * buffers described by iovp from the file open on descriptor fd.  If
 * offset is NULL the file's current offset is used; otherwise the file
 * must be a seekable vnode.  The iovec array lives in user space unless
 * FOF_IOV_SYSSPACE is set in flags.  The number of bytes transferred is
 * stored via retval.
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int		i, error;
	size_t		cnt;
	u_int		iovlen;
	struct file	*fp;
	struct iovec	*ktriov = NULL;
	lwp_t		*l;

	if (iovcnt == 0)
		return EINVAL;

	l = curlwp;

	/* fd_getfile() returns the file locked on success. */
	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
		return EBADF;

	if ((fp->f_flag & FREAD) == 0) {
		FILE_UNLOCK(fp);
		return EBADF;
	}

	FILE_USE(fp);

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/*
		 * An explicit offset only makes sense on a seekable
		 * vnode; pipes, sockets and FIFOs get ESPIPE.
		 */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		/* Kernel caller: use the iovec array in place. */
		iov = __UNCONST(iovp);
	else {
		/*
		 * User caller: copy the iovec array in, using the small
		 * on-stack array when it fits and allocating otherwise.
		 */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/* Total the transfer size, rejecting ssize_t overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO))  {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	/* A partial transfer before an interruption counts as success. */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
    328 
    329 /*
    330  * Write system call
    331  */
    332 int
    333 sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
    334 {
    335 	/* {
    336 		syscallarg(int)			fd;
    337 		syscallarg(const void *)	buf;
    338 		syscallarg(size_t)		nbyte;
    339 	} */
    340 	int		fd;
    341 	struct file	*fp;
    342 
    343 	fd = SCARG(uap, fd);
    344 
    345 	if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
    346 		return (EBADF);
    347 
    348 	if ((fp->f_flag & FWRITE) == 0) {
    349 		FILE_UNLOCK(fp);
    350 		return (EBADF);
    351 	}
    352 
    353 	FILE_USE(fp);
    354 
    355 	/* dofilewrite() will unuse the descriptor for us */
    356 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    357 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    358 }
    359 
/*
 * Common write path: transfer up to nbyte bytes from buf to the file,
 * starting at *offset.  FOF_UPDATE_OFFSET in flags advances the file
 * offset.  The caller's FILE_USE() reference is released here on every
 * path.  The byte count transferred is stored via retval.
 */
int
dofilewrite(int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct iovec aiov;
	struct uio auio;
	size_t cnt;
	int error;
	lwp_t *l;

	l = curlwp;

	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = l->l_proc->p_vmspace;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A partial transfer before interruption is a success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			/*
			 * POSIX: a write to a broken pipe delivers
			 * SIGPIPE as well as returning EPIPE.
			 */
			mutex_enter(&proclist_mutex);
			psignal(l->l_proc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
	*retval = cnt;
 out:
	FILE_UNUSE(fp, l);
	return (error);
}
    409 
    410 /*
    411  * Gather write system call
    412  */
    413 int
    414 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
    415 {
    416 	/* {
    417 		syscallarg(int)				fd;
    418 		syscallarg(const struct iovec *)	iovp;
    419 		syscallarg(int)				iovcnt;
    420 	} */
    421 
    422 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
    423 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    424 }
    425 
    426 int
    427 do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
    428     off_t *offset, int flags, register_t *retval)
    429 {
    430 	struct uio	auio;
    431 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    432 	int		i, error;
    433 	size_t		cnt;
    434 	u_int		iovlen;
    435 	struct file	*fp;
    436 	struct iovec	*ktriov = NULL;
    437 	lwp_t		*l;
    438 
    439 	l = curlwp;
    440 
    441 	if (iovcnt == 0)
    442 		return EINVAL;
    443 
    444 	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
    445 		return EBADF;
    446 
    447 	if ((fp->f_flag & FWRITE) == 0) {
    448 		FILE_UNLOCK(fp);
    449 		return EBADF;
    450 	}
    451 
    452 	FILE_USE(fp);
    453 
    454 	if (offset == NULL)
    455 		offset = &fp->f_offset;
    456 	else {
    457 		struct vnode *vp = fp->f_data;
    458 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    459 			error = ESPIPE;
    460 			goto out;
    461 		}
    462 		/*
    463 		 * Test that the device is seekable ?
    464 		 * XXX This works because no file systems actually
    465 		 * XXX take any action on the seek operation.
    466 		 */
    467 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    468 		if (error != 0)
    469 			goto out;
    470 	}
    471 
    472 	iovlen = iovcnt * sizeof(struct iovec);
    473 	if (flags & FOF_IOV_SYSSPACE)
    474 		iov = __UNCONST(iovp);
    475 	else {
    476 		iov = aiov;
    477 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    478 			if ((u_int)iovcnt > IOV_MAX) {
    479 				error = EINVAL;
    480 				goto out;
    481 			}
    482 			iov = kmem_alloc(iovlen, KM_SLEEP);
    483 			if (iov == NULL) {
    484 				error = ENOMEM;
    485 				goto out;
    486 			}
    487 			needfree = iov;
    488 		}
    489 		error = copyin(iovp, iov, iovlen);
    490 		if (error)
    491 			goto done;
    492 	}
    493 
    494 	auio.uio_iov = iov;
    495 	auio.uio_iovcnt = iovcnt;
    496 	auio.uio_rw = UIO_WRITE;
    497 	auio.uio_vmspace = curproc->p_vmspace;
    498 
    499 	auio.uio_resid = 0;
    500 	for (i = 0; i < iovcnt; i++, iov++) {
    501 		auio.uio_resid += iov->iov_len;
    502 		/*
    503 		 * Writes return ssize_t because -1 is returned on error.
    504 		 * Therefore we must restrict the length to SSIZE_MAX to
    505 		 * avoid garbage return values.
    506 		 */
    507 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    508 			error = EINVAL;
    509 			goto done;
    510 		}
    511 	}
    512 
    513 	/*
    514 	 * if tracing, save a copy of iovec
    515 	 */
    516 	if (ktrpoint(KTR_GENIO))  {
    517 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    518 		if (ktriov != NULL)
    519 			memcpy(ktriov, auio.uio_iov, iovlen);
    520 	}
    521 
    522 	cnt = auio.uio_resid;
    523 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    524 	if (error) {
    525 		if (auio.uio_resid != cnt && (error == ERESTART ||
    526 		    error == EINTR || error == EWOULDBLOCK))
    527 			error = 0;
    528 		if (error == EPIPE) {
    529 			mutex_enter(&proclist_mutex);
    530 			psignal(l->l_proc, SIGPIPE);
    531 			mutex_exit(&proclist_mutex);
    532 		}
    533 	}
    534 	cnt -= auio.uio_resid;
    535 	*retval = cnt;
    536 
    537 	if (ktriov != NULL) {
    538 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
    539 		kmem_free(ktriov, iovlen);
    540 	}
    541 
    542  done:
    543 	if (needfree)
    544 		kmem_free(needfree, iovlen);
    545  out:
    546 	FILE_UNUSE(fp, l);
    547 	return (error);
    548 }
    549 
/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void 		*data, *memp;
/* On-stack buffer for small ioctl arguments; avoids kmem_alloc(). */
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;
		goto out;
	}

	/*
	 * FIONCLEX/FIOCLEX operate on the descriptor table itself
	 * (close-on-exec flag), not on the file, so handle them here
	 * without calling into the file's ioctl routine.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			/* Argument passed by value: copy it in. */
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero-length IOC_IN: pass the pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before notifying the file. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	/*
	 * Map in-kernel sentinel errors to ENOTTY: -1 (with a warning,
	 * since drivers should not return it) and EPASSTHROUGH (the
	 * "not my ioctl" marker) must never escape to user space.
	 */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
    691 
    692 /*
    693  * Select system call.
    694  */
    695 int
    696 sys_pselect(struct lwp *l, const struct sys_pselect_args *uap, register_t *retval)
    697 {
    698 	/* {
    699 		syscallarg(int)				nd;
    700 		syscallarg(fd_set *)			in;
    701 		syscallarg(fd_set *)			ou;
    702 		syscallarg(fd_set *)			ex;
    703 		syscallarg(const struct timespec *)	ts;
    704 		syscallarg(sigset_t *)			mask;
    705 	} */
    706 	struct timespec	ats;
    707 	struct timeval	atv, *tv = NULL;
    708 	sigset_t	amask, *mask = NULL;
    709 	int		error;
    710 
    711 	if (SCARG(uap, ts)) {
    712 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    713 		if (error)
    714 			return error;
    715 		atv.tv_sec = ats.tv_sec;
    716 		atv.tv_usec = ats.tv_nsec / 1000;
    717 		tv = &atv;
    718 	}
    719 	if (SCARG(uap, mask) != NULL) {
    720 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    721 		if (error)
    722 			return error;
    723 		mask = &amask;
    724 	}
    725 
    726 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    727 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    728 }
    729 
/*
 * Validate a select/poll timeout and record the starting monotonic
 * time in *sleeptv for later use by gettimeleft().  Returns -1 if the
 * timeout is invalid, 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
    738 
    739 int
    740 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
    741 {
    742 	/*
    743 	 * We have to recalculate the timeout on every retry.
    744 	 */
    745 	struct timeval slepttv;
    746 	/*
    747 	 * reduce tv by elapsed time
    748 	 * based on monotonic time scale
    749 	 */
    750 	getmicrouptime(&slepttv);
    751 	timeradd(tv, sleeptv, tv);
    752 	timersub(tv, &slepttv, tv);
    753 	*sleeptv = slepttv;
    754 	return tvtohz(tv);
    755 }
    756 
    757 int
    758 sys_select(struct lwp *l, const struct sys_select_args *uap, register_t *retval)
    759 {
    760 	/* {
    761 		syscallarg(int)			nd;
    762 		syscallarg(fd_set *)		in;
    763 		syscallarg(fd_set *)		ou;
    764 		syscallarg(fd_set *)		ex;
    765 		syscallarg(struct timeval *)	tv;
    766 	} */
    767 	struct timeval atv, *tv = NULL;
    768 	int error;
    769 
    770 	if (SCARG(uap, tv)) {
    771 		error = copyin(SCARG(uap, tv), (void *)&atv,
    772 			sizeof(atv));
    773 		if (error)
    774 			return error;
    775 		tv = &atv;
    776 	}
    777 
    778 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    779 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    780 }
    781 
/*
 * Common code for select(2)/pselect(2).  Copies in the three fd_sets,
 * optionally installs a temporary signal mask, then scans descriptors
 * in a retry loop until a descriptor is ready, the timeout expires, a
 * signal arrives, or an error occurs.  Ready sets are copied back out.
 *
 * The bits buffer holds six fd_set-sized regions: the three input sets
 * at offsets 0..2 and the three output sets at offsets 3..5.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char 		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval  sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

/* Copy in one user fd_set, or zero the slot if the pointer is NULL. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the caller's temporary signal mask, if any. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	/*
	 * Scan/block retry loop.  l_selflag and nselcoll detect wakeups
	 * that race with the (unlocked) scan: if either changed while we
	 * scanned, rescan rather than block, so no wakeup is lost.
	 */
	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
	 	l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
 		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
    882 
    883 int
    884 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    885 	register_t *retval)
    886 {
    887 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    888 			       POLLWRNORM | POLLHUP | POLLERR,
    889 			       POLLRDBAND };
    890 	proc_t *p = l->l_proc;
    891 	struct filedesc	*fdp;
    892 	int msk, i, j, fd, n;
    893 	fd_mask ibits, obits;
    894 	struct file *fp;
    895 
    896 	fdp = p->p_fd;
    897 	n = 0;
    898 	for (msk = 0; msk < 3; msk++) {
    899 		for (i = 0; i < nfd; i += NFDBITS) {
    900 			ibits = *ibitp++;
    901 			obits = 0;
    902 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    903 				ibits &= ~(1 << j);
    904 				if ((fp = fd_getfile(fdp, fd)) == NULL)
    905 					return (EBADF);
    906 				FILE_USE(fp);
    907 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
    908 					obits |= (1 << j);
    909 					n++;
    910 				}
    911 				FILE_UNUSE(fp, l);
    912 			}
    913 			*obitp++ = obits;
    914 		}
    915 	}
    916 	*retval = n;
    917 	return (0);
    918 }
    919 
    920 /*
    921  * Poll system call.
    922  */
    923 int
    924 sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
    925 {
    926 	/* {
    927 		syscallarg(struct pollfd *)	fds;
    928 		syscallarg(u_int)		nfds;
    929 		syscallarg(int)			timeout;
    930 	} */
    931 	struct timeval	atv, *tv = NULL;
    932 
    933 	if (SCARG(uap, timeout) != INFTIM) {
    934 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    935 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    936 		tv = &atv;
    937 	}
    938 
    939 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    940 		tv, NULL);
    941 }
    942 
    943 /*
    944  * Poll system call.
    945  */
    946 int
    947 sys_pollts(struct lwp *l, const struct sys_pollts_args *uap, register_t *retval)
    948 {
    949 	/* {
    950 		syscallarg(struct pollfd *)		fds;
    951 		syscallarg(u_int)			nfds;
    952 		syscallarg(const struct timespec *)	ts;
    953 		syscallarg(const sigset_t *)		mask;
    954 	} */
    955 	struct timespec	ats;
    956 	struct timeval	atv, *tv = NULL;
    957 	sigset_t	amask, *mask = NULL;
    958 	int		error;
    959 
    960 	if (SCARG(uap, ts)) {
    961 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    962 		if (error)
    963 			return error;
    964 		atv.tv_sec = ats.tv_sec;
    965 		atv.tv_usec = ats.tv_nsec / 1000;
    966 		tv = &atv;
    967 	}
    968 	if (SCARG(uap, mask)) {
    969 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    970 		if (error)
    971 			return error;
    972 		mask = &amask;
    973 	}
    974 
    975 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    976 		tv, mask);
    977 }
    978 
/*
 * Common code for poll(2)/pollts(2).  Copies in the pollfd array,
 * optionally installs a temporary signal mask, then scans descriptors
 * in a retry loop until one reports events, the timeout expires, a
 * signal arrives, or an error occurs.  The (revents-updated) array is
 * copied back out on success.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the caller's temporary signal mask, if any. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	/*
	 * Scan/block retry loop.  l_selflag and nselcoll detect wakeups
	 * that race with the (unlocked) scan: if either changed while we
	 * scanned, rescan rather than block, so no wakeup is lost.
	 */
	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
   1062 
   1063 int
   1064 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1065 {
   1066 	proc_t		*p = l->l_proc;
   1067 	struct filedesc	*fdp;
   1068 	int		i, n;
   1069 	struct file	*fp;
   1070 
   1071 	fdp = p->p_fd;
   1072 	n = 0;
   1073 	for (i = 0; i < nfd; i++, fds++) {
   1074 		if (fds->fd >= fdp->fd_nfiles) {
   1075 			fds->revents = POLLNVAL;
   1076 			n++;
   1077 		} else if (fds->fd < 0) {
   1078 			fds->revents = 0;
   1079 		} else {
   1080 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
   1081 				fds->revents = POLLNVAL;
   1082 				n++;
   1083 			} else {
   1084 				FILE_USE(fp);
   1085 				fds->revents = (*fp->f_ops->fo_poll)(fp,
   1086 				    fds->events | POLLERR | POLLHUP, l);
   1087 				if (fds->revents != 0)
   1088 					n++;
   1089 				FILE_UNUSE(fp, l);
   1090 			}
   1091 		}
   1092 	}
   1093 	*retval = n;
   1094 	return (0);
   1095 }
   1096 
   1097 /*ARGSUSED*/
   1098 int
   1099 seltrue(dev_t dev, int events, lwp_t *l)
   1100 {
   1101 
   1102 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1103 }
   1104 
   1105 /*
   1106  * Record a select request.
   1107  */
   1108 void
   1109 selrecord(lwp_t *selector, struct selinfo *sip)
   1110 {
   1111 
   1112 	mutex_enter(&select_lock);
   1113 	if (sip->sel_lwp == NULL) {
   1114 		/* First named waiter, although there may be more. */
   1115 		sip->sel_lwp = selector;
   1116 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1117 	} else if (sip->sel_lwp != selector) {
   1118 		/* Multiple waiters. */
   1119 		sip->sel_collision = true;
   1120 	}
   1121 	mutex_exit(&select_lock);
   1122 }
   1123 
   1124 /*
   1125  * Do a wakeup when a selectable event occurs.
   1126  */
   1127 void
   1128 selnotify(struct selinfo *sip, int events, long knhint)
   1129 {
   1130 	lwp_t *l;
   1131 
   1132 	mutex_enter(&select_lock);
   1133 	if (sip->sel_collision) {
   1134 		/* Multiple waiters - just notify everybody. */
   1135 		nselcoll++;
   1136 		sip->sel_collision = false;
   1137 		cv_broadcast(&select_cv);
   1138 	} else if (sip->sel_lwp != NULL) {
   1139 		/* Only one LWP waiting. */
   1140 		l = sip->sel_lwp;
   1141 		if (l->l_selflag == SEL_BLOCKING) {
   1142 			/*
   1143 			 * If it's sleeping, wake it up.  If not, it's
   1144 			 * already awake but hasn't yet removed itself
   1145 			 * from the selector.  We reset the state below
   1146 			 * so that we only attempt to do this once.
   1147 			 */
   1148 			lwp_lock(l);
   1149 			if (l->l_wchan == &select_cv) {
   1150 				/* lwp_unsleep() releases the LWP lock. */
   1151 				(void)lwp_unsleep(l, true);
   1152 			} else
   1153 				lwp_unlock(l);
   1154 		} else {
   1155 			/*
   1156 			 * Not yet asleep.  Reset its state below so that
   1157 			 * it will go around again.
   1158 			 */
   1159 		}
   1160 		l->l_selflag = SEL_RESET;
   1161 	}
   1162 	mutex_exit(&select_lock);
   1163 
   1164 	KNOTE(&sip->sel_klist, knhint);
   1165 }
   1166 
   1167 /*
   1168  * Remove an LWP from all objects that it is waiting for.
   1169  */
   1170 void
   1171 selclear(void)
   1172 {
   1173 	struct selinfo *sip;
   1174 	lwp_t *l = curlwp;
   1175 
   1176 	KASSERT(mutex_owned(&select_lock));
   1177 
   1178 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1179 		KASSERT(sip->sel_lwp == l);
   1180 		sip->sel_lwp = NULL;
   1181 	}
   1182 }
   1183 
   1184 /*
   1185  * Initialize the select/poll system calls.
   1186  */
   1187 void
   1188 selsysinit(void)
   1189 {
   1190 
   1191 	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
   1192 	cv_init(&select_cv, "select");
   1193 }
   1194 
   1195 /*
   1196  * Initialize a selector.
   1197  */
   1198 void
   1199 selinit(struct selinfo *sip)
   1200 {
   1201 
   1202 	memset(sip, 0, sizeof(*sip));
   1203 }
   1204 
   1205 /*
   1206  * Destroy a selector.  The owning object must not gain new
   1207  * references while this is in progress: all activity on the
   1208  * selector must be stopped.
   1209  */
   1210 void
   1211 seldestroy(struct selinfo *sip)
   1212 {
   1213 	lwp_t *l;
   1214 
   1215 	if (sip->sel_lwp == NULL)
   1216 		return;
   1217 
   1218 	mutex_enter(&select_lock);
   1219 	if ((l = sip->sel_lwp) != NULL) {
   1220 		/* This should rarely happen, so SLIST_REMOVE() is OK. */
   1221 		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
   1222 		sip->sel_lwp = NULL;
   1223 	}
   1224 	mutex_exit(&select_lock);
   1225 }
   1226