Home | History | Annotate | Line # | Download | only in kern
sys_generic.c revision 1.115
      1 /*	$NetBSD: sys_generic.c,v 1.115 2008/03/21 21:55:00 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.115 2008/03/21 21:55:00 ad Exp $");
     81 
     82 #include <sys/param.h>
     83 #include <sys/systm.h>
     84 #include <sys/filedesc.h>
     85 #include <sys/ioctl.h>
     86 #include <sys/file.h>
     87 #include <sys/proc.h>
     88 #include <sys/socketvar.h>
     89 #include <sys/signalvar.h>
     90 #include <sys/uio.h>
     91 #include <sys/kernel.h>
     92 #include <sys/stat.h>
     93 #include <sys/kmem.h>
     94 #include <sys/poll.h>
     95 #include <sys/vnode.h>
     96 #include <sys/mount.h>
     97 #include <sys/syscallargs.h>
     98 #include <sys/ktrace.h>
     99 
    100 #include <uvm/uvm_extern.h>
    101 
    102 /* Flags for lwp::l_selflag. */
    103 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
    104 #define	SEL_SCANNING	1	/* polling descriptors */
    105 #define	SEL_BLOCKING	2	/* about to block on select_cv */
    106 
    107 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
    108 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
    109 
    110 /* Global state for select()/poll(). */
    111 kmutex_t	select_lock;
    112 kcondvar_t	select_cv;
    113 int		nselcoll;
    114 
    115 /*
    116  * Read system call.
    117  */
    118 /* ARGSUSED */
    119 int
    120 sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
    121 {
    122 	/* {
    123 		syscallarg(int)		fd;
    124 		syscallarg(void *)	buf;
    125 		syscallarg(size_t)	nbyte;
    126 	} */
    127 	file_t *fp;
    128 	int fd;
    129 
    130 	fd = SCARG(uap, fd);
    131 
    132 	if ((fp = fd_getfile(fd)) == NULL)
    133 		return (EBADF);
    134 
    135 	if ((fp->f_flag & FREAD) == 0) {
    136 		fd_putfile(fd);
    137 		return (EBADF);
    138 	}
    139 
    140 	/* dofileread() will unuse the descriptor for us */
    141 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    142 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    143 }
    144 
    145 int
    146 dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
    147 	off_t *offset, int flags, register_t *retval)
    148 {
    149 	struct iovec aiov;
    150 	struct uio auio;
    151 	size_t cnt;
    152 	int error;
    153 	lwp_t *l;
    154 
    155 	l = curlwp;
    156 
    157 	aiov.iov_base = (void *)buf;
    158 	aiov.iov_len = nbyte;
    159 	auio.uio_iov = &aiov;
    160 	auio.uio_iovcnt = 1;
    161 	auio.uio_resid = nbyte;
    162 	auio.uio_rw = UIO_READ;
    163 	auio.uio_vmspace = l->l_proc->p_vmspace;
    164 
    165 	/*
    166 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    167 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    168 	 * values.
    169 	 */
    170 	if (auio.uio_resid > SSIZE_MAX) {
    171 		error = EINVAL;
    172 		goto out;
    173 	}
    174 
    175 	cnt = auio.uio_resid;
    176 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    177 	if (error)
    178 		if (auio.uio_resid != cnt && (error == ERESTART ||
    179 		    error == EINTR || error == EWOULDBLOCK))
    180 			error = 0;
    181 	cnt -= auio.uio_resid;
    182 	ktrgenio(fd, UIO_READ, buf, cnt, error);
    183 	*retval = cnt;
    184  out:
    185 	fd_putfile(fd);
    186 	return (error);
    187 }
    188 
    189 /*
    190  * Scatter read system call.
    191  */
    192 int
    193 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
    194 {
    195 	/* {
    196 		syscallarg(int)				fd;
    197 		syscallarg(const struct iovec *)	iovp;
    198 		syscallarg(int)				iovcnt;
    199 	} */
    200 
    201 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
    202 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    203 }
    204 
/*
 * Common code for readv(2)/preadv(2): gather data from "fd" into the
 * iovec array.  If "offset" is NULL, the descriptor's file offset is
 * used and updated; otherwise the caller-supplied offset is used
 * (pread semantics), which is only valid on seekable vnodes.  With
 * FOF_IOV_SYSSPACE set in "flags", "iovp" already points at kernel
 * memory and is used in place.  *retval receives the byte count read.
 */
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int		i, error;
	size_t		cnt;
	u_int		iovlen;
	struct file	*fp;
	struct iovec	*ktriov = NULL;

	if (iovcnt == 0)
		return EINVAL;

	if ((fp = fd_getfile(fd)) == NULL)
		return EBADF;

	/* The descriptor must have been opened for reading. */
	if ((fp->f_flag & FREAD) == 0) {
		fd_putfile(fd);
		return EBADF;
	}

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/* Explicit offset: reject non-vnode and FIFO descriptors. */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		/*
		 * Copy the iovec array in from userspace: small counts fit
		 * on the stack, larger ones (up to IOV_MAX) are allocated.
		 */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_vmspace = curproc->p_vmspace;

	/* Sum the segment lengths, guarding the total against overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO))  {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	/* A transfer cut short by a signal still reports partial success. */
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	fd_putfile(fd);
	return (error);
}
    317 
    318 /*
    319  * Write system call
    320  */
    321 int
    322 sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
    323 {
    324 	/* {
    325 		syscallarg(int)			fd;
    326 		syscallarg(const void *)	buf;
    327 		syscallarg(size_t)		nbyte;
    328 	} */
    329 	file_t *fp;
    330 	int fd;
    331 
    332 	fd = SCARG(uap, fd);
    333 
    334 	if ((fp = fd_getfile(fd)) == NULL)
    335 		return (EBADF);
    336 
    337 	if ((fp->f_flag & FWRITE) == 0) {
    338 		fd_putfile(fd);
    339 		return (EBADF);
    340 	}
    341 
    342 	/* dofilewrite() will unuse the descriptor for us */
    343 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    344 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    345 }
    346 
    347 int
    348 dofilewrite(int fd, struct file *fp, const void *buf,
    349 	size_t nbyte, off_t *offset, int flags, register_t *retval)
    350 {
    351 	struct iovec aiov;
    352 	struct uio auio;
    353 	size_t cnt;
    354 	int error;
    355 
    356 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
    357 	aiov.iov_len = nbyte;
    358 	auio.uio_iov = &aiov;
    359 	auio.uio_iovcnt = 1;
    360 	auio.uio_resid = nbyte;
    361 	auio.uio_rw = UIO_WRITE;
    362 	auio.uio_vmspace = curproc->p_vmspace;
    363 
    364 	/*
    365 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    366 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    367 	 * values.
    368 	 */
    369 	if (auio.uio_resid > SSIZE_MAX) {
    370 		error = EINVAL;
    371 		goto out;
    372 	}
    373 
    374 	cnt = auio.uio_resid;
    375 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    376 	if (error) {
    377 		if (auio.uio_resid != cnt && (error == ERESTART ||
    378 		    error == EINTR || error == EWOULDBLOCK))
    379 			error = 0;
    380 		if (error == EPIPE) {
    381 			mutex_enter(&proclist_mutex);
    382 			psignal(curproc, SIGPIPE);
    383 			mutex_exit(&proclist_mutex);
    384 		}
    385 	}
    386 	cnt -= auio.uio_resid;
    387 	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
    388 	*retval = cnt;
    389  out:
    390 	fd_putfile(fd);
    391 	return (error);
    392 }
    393 
    394 /*
    395  * Gather write system call
    396  */
    397 int
    398 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
    399 {
    400 	/* {
    401 		syscallarg(int)				fd;
    402 		syscallarg(const struct iovec *)	iovp;
    403 		syscallarg(int)				iovcnt;
    404 	} */
    405 
    406 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
    407 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    408 }
    409 
/*
 * Common code for writev(2)/pwritev(2): scatter data from the iovec
 * array out to "fd".  If "offset" is NULL, the descriptor's file
 * offset is used and updated; otherwise the caller-supplied offset is
 * used (pwrite semantics), which is only valid on seekable vnodes.
 * With FOF_IOV_SYSSPACE set in "flags", "iovp" already points at
 * kernel memory and is used in place.  A write to a broken pipe
 * raises SIGPIPE in addition to returning EPIPE.  *retval receives
 * the byte count written.
 */
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
    off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
	int		i, error;
	size_t		cnt;
	u_int		iovlen;
	struct file	*fp;
	struct iovec	*ktriov = NULL;

	if (iovcnt == 0)
		return EINVAL;

	if ((fp = fd_getfile(fd)) == NULL)
		return EBADF;

	/* The descriptor must have been opened for writing. */
	if ((fp->f_flag & FWRITE) == 0) {
		fd_putfile(fd);
		return EBADF;
	}

	if (offset == NULL)
		offset = &fp->f_offset;
	else {
		/* Explicit offset: reject non-vnode and FIFO descriptors. */
		struct vnode *vp = fp->f_data;
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
			error = ESPIPE;
			goto out;
		}
		/*
		 * Test that the device is seekable ?
		 * XXX This works because no file systems actually
		 * XXX take any action on the seek operation.
		 */
		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
		if (error != 0)
			goto out;
	}

	iovlen = iovcnt * sizeof(struct iovec);
	if (flags & FOF_IOV_SYSSPACE)
		iov = __UNCONST(iovp);
	else {
		/*
		 * Copy the iovec array in from userspace: small counts fit
		 * on the stack, larger ones (up to IOV_MAX) are allocated.
		 */
		iov = aiov;
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = kmem_alloc(iovlen, KM_SLEEP);
			if (iov == NULL) {
				error = ENOMEM;
				goto out;
			}
			needfree = iov;
		}
		error = copyin(iovp, iov, iovlen);
		if (error)
			goto done;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_vmspace = curproc->p_vmspace;

	/* Sum the segment lengths, guarding the total against overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++, iov++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
	}

	/*
	 * if tracing, save a copy of iovec
	 */
	if (ktrpoint(KTR_GENIO))  {
		ktriov = kmem_alloc(iovlen, KM_SLEEP);
		if (ktriov != NULL)
			memcpy(ktriov, auio.uio_iov, iovlen);
	}

	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A transfer cut short by a signal is a partial success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Broken pipe: deliver SIGPIPE as well. */
		if (error == EPIPE) {
			mutex_enter(&proclist_mutex);
			psignal(curproc, SIGPIPE);
			mutex_exit(&proclist_mutex);
		}
	}
	cnt -= auio.uio_resid;
	*retval = cnt;

	if (ktriov != NULL) {
		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
		kmem_free(ktriov, iovlen);
	}

 done:
	if (needfree)
		kmem_free(needfree, iovlen);
 out:
	fd_putfile(fd);
	return (error);
}
    528 
/*
 * ioctl(2) system call.
 *
 * Decodes the command word: the high bits of "com" encode the data
 * transfer direction (IOC_IN/IOC_OUT/IOC_VOID) and the size of the
 * argument structure, which is staged through a kernel buffer (stack
 * for small sizes, heap otherwise) around the fo_ioctl call.
 * FIONCLEX/FIOCLEX and FIONBIO/FIOASYNC are handled here rather than
 * by the file's ioctl routine.
 */
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void 		*data, *memp;
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
	fdfile_t	*ff;

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(SCARG(uap, fd))) == NULL)
		return (EBADF);

	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		com = 0;
		goto out;
	}

	/* Close-on-exec requests act on the descriptor, not the file. */
	ff = fdp->fd_ofiles[SCARG(uap, fd)];
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		ff->ff_exclose = 0;
		goto out;

	case FIOCLEX:
		ff->ff_exclose = 1;
		fdp->fd_exclose = 1;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	/* Stage the argument in stkbuf if it fits, else allocate. */
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			/* Copy the user's argument structure in. */
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
			    size, 0);
		} else
			/* Zero size: pass the user pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Toggle non-blocking mode on the open file. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data);
		break;

	case FIOASYNC:
		/* Toggle async I/O notification on the open file. */
		FILE_LOCK(fp);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
			    size, error);
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	fd_putfile(SCARG(uap, fd));
	switch (error) {
	case -1:
		/* -1 from a driver is a bug; log it and map to ENOTTY. */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
    667 
    668 /*
    669  * Select system call.
    670  */
    671 int
    672 sys_pselect(struct lwp *l, const struct sys_pselect_args *uap, register_t *retval)
    673 {
    674 	/* {
    675 		syscallarg(int)				nd;
    676 		syscallarg(fd_set *)			in;
    677 		syscallarg(fd_set *)			ou;
    678 		syscallarg(fd_set *)			ex;
    679 		syscallarg(const struct timespec *)	ts;
    680 		syscallarg(sigset_t *)			mask;
    681 	} */
    682 	struct timespec	ats;
    683 	struct timeval	atv, *tv = NULL;
    684 	sigset_t	amask, *mask = NULL;
    685 	int		error;
    686 
    687 	if (SCARG(uap, ts)) {
    688 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    689 		if (error)
    690 			return error;
    691 		atv.tv_sec = ats.tv_sec;
    692 		atv.tv_usec = ats.tv_nsec / 1000;
    693 		tv = &atv;
    694 	}
    695 	if (SCARG(uap, mask) != NULL) {
    696 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    697 		if (error)
    698 			return error;
    699 		mask = &amask;
    700 	}
    701 
    702 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    703 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    704 }
    705 
/*
 * Validate a timeout and record the monotonic start time in *sleeptv
 * for later use by gettimeleft().  Returns -1 if the timeout is
 * invalid, 0 otherwise.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{

	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
    714 
    715 int
    716 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
    717 {
    718 	/*
    719 	 * We have to recalculate the timeout on every retry.
    720 	 */
    721 	struct timeval slepttv;
    722 	/*
    723 	 * reduce tv by elapsed time
    724 	 * based on monotonic time scale
    725 	 */
    726 	getmicrouptime(&slepttv);
    727 	timeradd(tv, sleeptv, tv);
    728 	timersub(tv, &slepttv, tv);
    729 	*sleeptv = slepttv;
    730 	return tvtohz(tv);
    731 }
    732 
    733 int
    734 sys_select(struct lwp *l, const struct sys_select_args *uap, register_t *retval)
    735 {
    736 	/* {
    737 		syscallarg(int)			nd;
    738 		syscallarg(fd_set *)		in;
    739 		syscallarg(fd_set *)		ou;
    740 		syscallarg(fd_set *)		ex;
    741 		syscallarg(struct timeval *)	tv;
    742 	} */
    743 	struct timeval atv, *tv = NULL;
    744 	int error;
    745 
    746 	if (SCARG(uap, tv)) {
    747 		error = copyin(SCARG(uap, tv), (void *)&atv,
    748 			sizeof(atv));
    749 		if (error)
    750 			return error;
    751 		tv = &atv;
    752 	}
    753 
    754 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    755 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    756 }
    757 
/*
 * Common code for select(2)/pselect(2).
 *
 * The three input fd_set's are copied into a single buffer holding
 * six bitmaps (three input at offsets 0..2*ni, three output at
 * 3*ni..5*ni).  The scan loop runs unlocked; select_lock and the
 * l_selflag/nselcoll handshake with selrecord()/selwakeup() detect
 * wakeups that race with the scan, in which case we rescan rather
 * than block.  If "mask" is non-NULL it temporarily replaces the
 * LWP's signal mask for the duration of the call.
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char 		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval  sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	/* ni bytes per bitmap; six bitmaps total (3 in + 3 out). */
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

	/* Copy in each set, or zero its slot if the user passed NULL. */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller's signal mask, if any. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Scan runs unlocked; l_selflag/ncoll detect races. */
	 	l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
 		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* A wakeup or collision raced with the scan: rescan. */
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy the three result bitmaps back out. */
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
    858 
    859 int
    860 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    861 	register_t *retval)
    862 {
    863 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    864 			       POLLWRNORM | POLLHUP | POLLERR,
    865 			       POLLRDBAND };
    866 	int msk, i, j, fd, n;
    867 	fd_mask ibits, obits;
    868 	file_t *fp;
    869 
    870 	n = 0;
    871 	for (msk = 0; msk < 3; msk++) {
    872 		for (i = 0; i < nfd; i += NFDBITS) {
    873 			ibits = *ibitp++;
    874 			obits = 0;
    875 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    876 				ibits &= ~(1 << j);
    877 				if ((fp = fd_getfile(fd)) == NULL)
    878 					return (EBADF);
    879 				if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
    880 					obits |= (1 << j);
    881 					n++;
    882 				}
    883 				fd_putfile(fd);
    884 			}
    885 			*obitp++ = obits;
    886 		}
    887 	}
    888 	*retval = n;
    889 	return (0);
    890 }
    891 
    892 /*
    893  * Poll system call.
    894  */
    895 int
    896 sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
    897 {
    898 	/* {
    899 		syscallarg(struct pollfd *)	fds;
    900 		syscallarg(u_int)		nfds;
    901 		syscallarg(int)			timeout;
    902 	} */
    903 	struct timeval	atv, *tv = NULL;
    904 
    905 	if (SCARG(uap, timeout) != INFTIM) {
    906 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    907 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    908 		tv = &atv;
    909 	}
    910 
    911 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    912 		tv, NULL);
    913 }
    914 
    915 /*
    916  * Poll system call.
    917  */
    918 int
    919 sys_pollts(struct lwp *l, const struct sys_pollts_args *uap, register_t *retval)
    920 {
    921 	/* {
    922 		syscallarg(struct pollfd *)		fds;
    923 		syscallarg(u_int)			nfds;
    924 		syscallarg(const struct timespec *)	ts;
    925 		syscallarg(const sigset_t *)		mask;
    926 	} */
    927 	struct timespec	ats;
    928 	struct timeval	atv, *tv = NULL;
    929 	sigset_t	amask, *mask = NULL;
    930 	int		error;
    931 
    932 	if (SCARG(uap, ts)) {
    933 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    934 		if (error)
    935 			return error;
    936 		atv.tv_sec = ats.tv_sec;
    937 		atv.tv_usec = ats.tv_nsec / 1000;
    938 		tv = &atv;
    939 	}
    940 	if (SCARG(uap, mask)) {
    941 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    942 		if (error)
    943 			return error;
    944 		mask = &amask;
    945 	}
    946 
    947 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    948 		tv, mask);
    949 }
    950 
/*
 * Common code for poll(2)/pollts(2).
 *
 * The pollfd array is copied into a kernel buffer (stack for small
 * counts) and scanned unlocked; select_lock and the
 * l_selflag/nselcoll handshake with selrecord()/selwakeup() detect
 * wakeups that race with the scan, in which case we rescan rather
 * than block.  If "mask" is non-NULL it temporarily replaces the
 * LWP's signal mask for the duration of the call.  The array, with
 * revents filled in, is copied back out on success.
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Temporarily install the caller's signal mask, if any. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/* Scan runs unlocked; l_selflag/ncoll detect races. */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* A wakeup or collision raced with the scan: rescan. */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
   1034 
   1035 int
   1036 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1037 {
   1038 	int i, n;
   1039 	file_t *fp;
   1040 
   1041 	n = 0;
   1042 	for (i = 0; i < nfd; i++, fds++) {
   1043 		if (fds->fd < 0) {
   1044 			fds->revents = 0;
   1045 		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
   1046 			fds->revents = POLLNVAL;
   1047 			n++;
   1048 		} else {
   1049 			fds->revents = (*fp->f_ops->fo_poll)(fp,
   1050 			    fds->events | POLLERR | POLLHUP);
   1051 			if (fds->revents != 0)
   1052 				n++;
   1053 			fd_putfile(fds->fd);
   1054 		}
   1055 	}
   1056 	*retval = n;
   1057 	return (0);
   1058 }
   1059 
   1060 /*ARGSUSED*/
   1061 int
   1062 seltrue(dev_t dev, int events, lwp_t *l)
   1063 {
   1064 
   1065 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1066 }
   1067 
   1068 /*
   1069  * Record a select request.
   1070  */
   1071 void
   1072 selrecord(lwp_t *selector, struct selinfo *sip)
   1073 {
   1074 
   1075 	mutex_enter(&select_lock);
   1076 	if (sip->sel_lwp == NULL) {
   1077 		/* First named waiter, although there may be more. */
   1078 		sip->sel_lwp = selector;
   1079 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1080 	} else if (sip->sel_lwp != selector) {
   1081 		/* Multiple waiters. */
   1082 		sip->sel_collision = true;
   1083 	}
   1084 	mutex_exit(&select_lock);
   1085 }
   1086 
   1087 /*
   1088  * Do a wakeup when a selectable event occurs.
   1089  */
   1090 void
   1091 selnotify(struct selinfo *sip, int events, long knhint)
   1092 {
   1093 	lwp_t *l;
   1094 
   1095 	mutex_enter(&select_lock);
   1096 	if (sip->sel_collision) {
   1097 		/* Multiple waiters - just notify everybody. */
   1098 		nselcoll++;
   1099 		sip->sel_collision = false;
   1100 		cv_broadcast(&select_cv);
   1101 	} else if (sip->sel_lwp != NULL) {
   1102 		/* Only one LWP waiting. */
   1103 		l = sip->sel_lwp;
   1104 		if (l->l_selflag == SEL_BLOCKING) {
   1105 			/*
   1106 			 * If it's sleeping, wake it up.  If not, it's
   1107 			 * already awake but hasn't yet removed itself
   1108 			 * from the selector.  We reset the state below
   1109 			 * so that we only attempt to do this once.
   1110 			 */
   1111 			lwp_lock(l);
   1112 			if (l->l_wchan == &select_cv) {
   1113 				/* lwp_unsleep() releases the LWP lock. */
   1114 				(void)lwp_unsleep(l, true);
   1115 			} else
   1116 				lwp_unlock(l);
   1117 		} else {
   1118 			/*
   1119 			 * Not yet asleep.  Reset its state below so that
   1120 			 * it will go around again.
   1121 			 */
   1122 		}
   1123 		l->l_selflag = SEL_RESET;
   1124 	}
   1125 	mutex_exit(&select_lock);
   1126 
   1127 	KNOTE(&sip->sel_klist, knhint);
   1128 }
   1129 
   1130 /*
   1131  * Remove an LWP from all objects that it is waiting for.
   1132  */
   1133 void
   1134 selclear(void)
   1135 {
   1136 	struct selinfo *sip;
   1137 	lwp_t *l = curlwp;
   1138 
   1139 	KASSERT(mutex_owned(&select_lock));
   1140 
   1141 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1142 		KASSERT(sip->sel_lwp == l);
   1143 		sip->sel_lwp = NULL;
   1144 	}
   1145 }
   1146 
   1147 /*
   1148  * Initialize the select/poll system calls.
   1149  */
   1150 void
   1151 selsysinit(void)
   1152 {
   1153 
   1154 	mutex_init(&select_lock, MUTEX_DEFAULT, IPL_VM);
   1155 	cv_init(&select_cv, "select");
   1156 }
   1157 
   1158 /*
   1159  * Initialize a selector.
   1160  */
   1161 void
   1162 selinit(struct selinfo *sip)
   1163 {
   1164 
   1165 	memset(sip, 0, sizeof(*sip));
   1166 }
   1167 
   1168 /*
   1169  * Destroy a selector.  The owning object must not gain new
   1170  * references while this is in progress: all activity on the
   1171  * selector must be stopped.
   1172  */
   1173 void
   1174 seldestroy(struct selinfo *sip)
   1175 {
   1176 	lwp_t *l;
   1177 
   1178 	if (sip->sel_lwp == NULL)
   1179 		return;
   1180 
   1181 	mutex_enter(&select_lock);
   1182 	if ((l = sip->sel_lwp) != NULL) {
   1183 		/* This should rarely happen, so SLIST_REMOVE() is OK. */
   1184 		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
   1185 		sip->sel_lwp = NULL;
   1186 	}
   1187 	mutex_exit(&select_lock);
   1188 }
   1189