sys_generic.c revision 1.108
      1 /*	$NetBSD: sys_generic.c,v 1.108 2007/10/08 15:12:08 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.108 2007/10/08 15:12:08 ad Exp $");
     81 
     82 #include <sys/param.h>
     83 #include <sys/systm.h>
     84 #include <sys/filedesc.h>
     85 #include <sys/ioctl.h>
     86 #include <sys/file.h>
     87 #include <sys/proc.h>
     88 #include <sys/socketvar.h>
     89 #include <sys/signalvar.h>
     90 #include <sys/uio.h>
     91 #include <sys/kernel.h>
     92 #include <sys/stat.h>
     93 #include <sys/kmem.h>
     94 #include <sys/poll.h>
     95 #include <sys/vnode.h>
     96 #include <sys/mount.h>
     97 #include <sys/syscallargs.h>
     98 #include <sys/ktrace.h>
     99 
    100 #include <uvm/uvm_extern.h>
    101 
    102 /* Flags for lwp::l_selflag. */
    103 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
    104 #define	SEL_SCANNING	1	/* polling descriptors */
    105 #define	SEL_BLOCKING	2	/* about to block on select_cv */
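/*
 * Together these describe where an LWP is in the select()/poll() retry
 * loop: each pass through selcommon()/pollcommon() starts in SEL_SCANNING,
 * the LWP moves to SEL_BLOCKING just before sleeping on select_cv, and
 * selwakeup() knocks it back to SEL_RESET so that the loop rescans the
 * descriptors instead of blocking again.
 */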
    106 
    107 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
    108 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
    109 static void	selclear(void);
    110 
    111 /* Global state for select()/poll(). */
    112 kmutex_t	select_lock;
    113 kcondvar_t	select_cv;
    114 int		nselcoll;
    115 
    116 /*
    117  * Read system call.
    118  */
    119 /* ARGSUSED */
    120 int
    121 sys_read(lwp_t *l, void *v, register_t *retval)
    122 {
    123 	struct sys_read_args /* {
    124 		syscallarg(int)		fd;
    125 		syscallarg(void *)	buf;
    126 		syscallarg(size_t)	nbyte;
    127 	} */ *uap = v;
    128 	int		fd;
    129 	struct file	*fp;
    130 	proc_t		*p;
    131 	struct filedesc	*fdp;
    132 
    133 	fd = SCARG(uap, fd);
    134 	p = l->l_proc;
    135 	fdp = p->p_fd;
    136 
    137 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    138 		return (EBADF);
    139 
    140 	if ((fp->f_flag & FREAD) == 0) {
    141 		mutex_exit(&fp->f_lock);
    142 		return (EBADF);
    143 	}
    144 
    145 	FILE_USE(fp);
    146 
    147 	/* dofileread() will unuse the descriptor for us */
    148 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    149 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    150 }
    151 
    152 int
    153 dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
    154 	off_t *offset, int flags, register_t *retval)
    155 {
    156 	struct iovec aiov;
    157 	struct uio auio;
    158 	size_t cnt;
    159 	int error;
    160 	lwp_t *l;
    161 
    162 	l = curlwp;
    163 
    164 	aiov.iov_base = (void *)buf;
    165 	aiov.iov_len = nbyte;
    166 	auio.uio_iov = &aiov;
    167 	auio.uio_iovcnt = 1;
    168 	auio.uio_resid = nbyte;
    169 	auio.uio_rw = UIO_READ;
    170 	auio.uio_vmspace = l->l_proc->p_vmspace;
    171 
    172 	/*
    173 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    174 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    175 	 * values.
    176 	 */
    177 	if (auio.uio_resid > SSIZE_MAX) {
    178 		error = EINVAL;
    179 		goto out;
    180 	}
    181 
    182 	cnt = auio.uio_resid;
    183 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    184 	if (error)
    185 		if (auio.uio_resid != cnt && (error == ERESTART ||
    186 		    error == EINTR || error == EWOULDBLOCK))
    187 			error = 0;
    188 	cnt -= auio.uio_resid;
    189 	ktrgenio(fd, UIO_READ, buf, cnt, error);
    190 	*retval = cnt;
    191  out:
    192 	FILE_UNUSE(fp, l);
    193 	return (error);
    194 }
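/*
 * A minimal, hypothetical userland sketch of the semantics implemented by
 * sys_read()/dofileread() above: a read may transfer fewer bytes than
 * requested, and a transfer interrupted after data has already moved
 * reports the partial count rather than an error.  The helper name is
 * made up and the block is illustrative only.
 */
#if 0
#include <sys/types.h>
#include <errno.h>
#include <unistd.h>

/* Read exactly "len" bytes unless EOF or a hard error intervenes. */
static ssize_t
read_full(int fd, void *buf, size_t len)
{
	size_t done = 0;
	ssize_t n;

	while (done < len) {
		n = read(fd, (char *)buf + done, len - done);
		if (n == 0)
			break;			/* EOF */
		if (n == -1) {
			if (errno == EINTR)
				continue;	/* retry after a signal */
			return -1;		/* hard error */
		}
		done += (size_t)n;
	}
	return (ssize_t)done;
}
#endif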
    195 
    196 /*
    197  * Scatter read system call.
    198  */
    199 int
    200 sys_readv(lwp_t *l, void *v, register_t *retval)
    201 {
    202 	struct sys_readv_args /* {
    203 		syscallarg(int)				fd;
    204 		syscallarg(const struct iovec *)	iovp;
    205 		syscallarg(int)				iovcnt;
    206 	} */ *uap = v;
    207 
    208 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
    209 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    210 }
    211 
    212 int
    213 do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
    214     off_t *offset, int flags, register_t *retval)
    215 {
    216 	struct uio	auio;
    217 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    218 	int		i, error;
    219 	size_t		cnt;
    220 	u_int		iovlen;
    221 	struct file	*fp;
    222 	struct iovec	*ktriov = NULL;
    223 	lwp_t		*l;
    224 
    225 	if (iovcnt == 0)
    226 		return EINVAL;
    227 
    228 	l = curlwp;
    229 
    230 	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
    231 		return EBADF;
    232 
    233 	if ((fp->f_flag & FREAD) == 0) {
    234 		mutex_exit(&fp->f_lock);
    235 		return EBADF;
    236 	}
    237 
    238 	FILE_USE(fp);
    239 
    240 	if (offset == NULL)
    241 		offset = &fp->f_offset;
    242 	else {
    243 		struct vnode *vp = fp->f_data;
    244 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    245 			error = ESPIPE;
    246 			goto out;
    247 		}
    248 		/*
    249 		 * Test whether the device is seekable.
    250 		 * XXX This works because no file systems actually
    251 		 * XXX take any action on the seek operation.
    252 		 */
    253 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    254 		if (error != 0)
    255 			goto out;
    256 	}
    257 
    258 	iovlen = iovcnt * sizeof(struct iovec);
    259 	if (flags & FOF_IOV_SYSSPACE)
    260 		iov = __UNCONST(iovp);
    261 	else {
    262 		iov = aiov;
    263 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    264 			if ((u_int)iovcnt > IOV_MAX) {
    265 				error = EINVAL;
    266 				goto out;
    267 			}
    268 			iov = kmem_alloc(iovlen, KM_SLEEP);
    269 			if (iov == NULL) {
    270 				error = ENOMEM;
    271 				goto out;
    272 			}
    273 			needfree = iov;
    274 		}
    275 		error = copyin(iovp, iov, iovlen);
    276 		if (error)
    277 			goto done;
    278 	}
    279 
    280 	auio.uio_iov = iov;
    281 	auio.uio_iovcnt = iovcnt;
    282 	auio.uio_rw = UIO_READ;
    283 	auio.uio_vmspace = l->l_proc->p_vmspace;
    284 
    285 	auio.uio_resid = 0;
    286 	for (i = 0; i < iovcnt; i++, iov++) {
    287 		auio.uio_resid += iov->iov_len;
    288 		/*
    289 		 * Reads return ssize_t because -1 is returned on error.
    290 		 * Therefore we must restrict the length to SSIZE_MAX to
    291 		 * avoid garbage return values.
    292 		 */
    293 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    294 			error = EINVAL;
    295 			goto done;
    296 		}
    297 	}
    298 
    299 	/*
    300 	 * If tracing, save a copy of the iovec array.
    301 	 */
    302 	if (ktrpoint(KTR_GENIO))  {
    303 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    304 		if (ktriov != NULL)
    305 			memcpy(ktriov, auio.uio_iov, iovlen);
    306 	}
    307 
    308 	cnt = auio.uio_resid;
    309 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    310 	if (error)
    311 		if (auio.uio_resid != cnt && (error == ERESTART ||
    312 		    error == EINTR || error == EWOULDBLOCK))
    313 			error = 0;
    314 	cnt -= auio.uio_resid;
    315 	*retval = cnt;
    316 
    317 	if (ktriov != NULL) {
    318 		ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
    319 		kmem_free(ktriov, iovlen);
    320 	}
    321 
    322  done:
    323 	if (needfree)
    324 		kmem_free(needfree, iovlen);
    325  out:
    326 	FILE_UNUSE(fp, l);
    327 	return (error);
    328 }
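/*
 * A hypothetical sketch of the scatter-read interface that do_filereadv()
 * implements: userland passes an array of iovecs and the kernel fills
 * them in order, returning the total number of bytes transferred.  The
 * helper name is illustrative only.
 */
#if 0
#include <sys/uio.h>
#include <unistd.h>

static ssize_t
read_header_then_body(int fd, void *hdr, size_t hdrlen, void *body,
    size_t bodylen)
{
	struct iovec iov[2];

	iov[0].iov_base = hdr;
	iov[0].iov_len = hdrlen;
	iov[1].iov_base = body;
	iov[1].iov_len = bodylen;

	/* One system call; data lands in iov[0] first, then iov[1]. */
	return readv(fd, iov, 2);
}
#endif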
    329 
    330 /*
    331  * Write system call
    332  */
    333 int
    334 sys_write(lwp_t *l, void *v, register_t *retval)
    335 {
    336 	struct sys_write_args /* {
    337 		syscallarg(int)			fd;
    338 		syscallarg(const void *)	buf;
    339 		syscallarg(size_t)		nbyte;
    340 	} */ *uap = v;
    341 	int		fd;
    342 	struct file	*fp;
    343 
    344 	fd = SCARG(uap, fd);
    345 
    346 	if ((fp = fd_getfile(curproc->p_fd, fd)) == NULL)
    347 		return (EBADF);
    348 
    349 	if ((fp->f_flag & FWRITE) == 0) {
    350 		mutex_exit(&fp->f_lock);
    351 		return (EBADF);
    352 	}
    353 
    354 	FILE_USE(fp);
    355 
    356 	/* dofilewrite() will unuse the descriptor for us */
    357 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    358 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    359 }
    360 
    361 int
    362 dofilewrite(int fd, struct file *fp, const void *buf,
    363 	size_t nbyte, off_t *offset, int flags, register_t *retval)
    364 {
    365 	struct iovec aiov;
    366 	struct uio auio;
    367 	size_t cnt;
    368 	int error;
    369 	lwp_t *l;
    370 
    371 	l = curlwp;
    372 
    373 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
    374 	aiov.iov_len = nbyte;
    375 	auio.uio_iov = &aiov;
    376 	auio.uio_iovcnt = 1;
    377 	auio.uio_resid = nbyte;
    378 	auio.uio_rw = UIO_WRITE;
    379 	auio.uio_vmspace = l->l_proc->p_vmspace;
    380 
    381 	/*
    382 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    383 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    384 	 * values.
    385 	 */
    386 	if (auio.uio_resid > SSIZE_MAX) {
    387 		error = EINVAL;
    388 		goto out;
    389 	}
    390 
    391 	cnt = auio.uio_resid;
    392 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    393 	if (error) {
    394 		if (auio.uio_resid != cnt && (error == ERESTART ||
    395 		    error == EINTR || error == EWOULDBLOCK))
    396 			error = 0;
    397 		if (error == EPIPE) {
    398 			mutex_enter(&proclist_mutex);
    399 			psignal(l->l_proc, SIGPIPE);
    400 			mutex_exit(&proclist_mutex);
    401 		}
    402 	}
    403 	cnt -= auio.uio_resid;
    404 	ktrgenio(fd, UIO_WRITE, buf, cnt, error);
    405 	*retval = cnt;
    406  out:
    407 	FILE_UNUSE(fp, l);
    408 	return (error);
    409 }
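/*
 * A hypothetical userland counterpart to dofilewrite(): short writes are
 * possible, and a write to a broken pipe both fails with EPIPE and raises
 * SIGPIPE (the psignal() call above).  The helper name is made up.
 */
#if 0
#include <sys/types.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>

static ssize_t
write_full(int fd, const void *buf, size_t len)
{
	size_t done = 0;
	ssize_t n;

	/* Callers that prefer seeing only EPIPE can ignore the signal. */
	(void)signal(SIGPIPE, SIG_IGN);

	while (done < len) {
		n = write(fd, (const char *)buf + done, len - done);
		if (n == -1) {
			if (errno == EINTR)
				continue;	/* retry after a signal */
			return -1;
		}
		done += (size_t)n;
	}
	return (ssize_t)done;
}
#endif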
    410 
    411 /*
    412  * Gather write system call
    413  */
    414 int
    415 sys_writev(lwp_t *l, void *v, register_t *retval)
    416 {
    417 	struct sys_writev_args /* {
    418 		syscallarg(int)				fd;
    419 		syscallarg(const struct iovec *)	iovp;
    420 		syscallarg(int)				iovcnt;
    421 	} */ *uap = v;
    422 
    423 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
    424 	    SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
    425 }
    426 
    427 int
    428 do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
    429     off_t *offset, int flags, register_t *retval)
    430 {
    431 	struct uio	auio;
    432 	struct iovec	*iov, *needfree = NULL, aiov[UIO_SMALLIOV];
    433 	int		i, error;
    434 	size_t		cnt;
    435 	u_int		iovlen;
    436 	struct file	*fp;
    437 	struct iovec	*ktriov = NULL;
    438 	lwp_t		*l;
    439 
    440 	l = curlwp;
    441 
    442 	if (iovcnt == 0)
    443 		return EINVAL;
    444 
    445 	if ((fp = fd_getfile(l->l_proc->p_fd, fd)) == NULL)
    446 		return EBADF;
    447 
    448 	if ((fp->f_flag & FWRITE) == 0) {
    449 		mutex_exit(&fp->f_lock);
    450 		return EBADF;
    451 	}
    452 
    453 	FILE_USE(fp);
    454 
    455 	if (offset == NULL)
    456 		offset = &fp->f_offset;
    457 	else {
    458 		struct vnode *vp = fp->f_data;
    459 		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
    460 			error = ESPIPE;
    461 			goto out;
    462 		}
    463 		/*
    464 		 * Test whether the device is seekable.
    465 		 * XXX This works because no file systems actually
    466 		 * XXX take any action on the seek operation.
    467 		 */
    468 		error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred);
    469 		if (error != 0)
    470 			goto out;
    471 	}
    472 
    473 	iovlen = iovcnt * sizeof(struct iovec);
    474 	if (flags & FOF_IOV_SYSSPACE)
    475 		iov = __UNCONST(iovp);
    476 	else {
    477 		iov = aiov;
    478 		if ((u_int)iovcnt > UIO_SMALLIOV) {
    479 			if ((u_int)iovcnt > IOV_MAX) {
    480 				error = EINVAL;
    481 				goto out;
    482 			}
    483 			iov = kmem_alloc(iovlen, KM_SLEEP);
    484 			if (iov == NULL) {
    485 				error = ENOMEM;
    486 				goto out;
    487 			}
    488 			needfree = iov;
    489 		}
    490 		error = copyin(iovp, iov, iovlen);
    491 		if (error)
    492 			goto done;
    493 	}
    494 
    495 	auio.uio_iov = iov;
    496 	auio.uio_iovcnt = iovcnt;
    497 	auio.uio_rw = UIO_WRITE;
    498 	auio.uio_vmspace = curproc->p_vmspace;
    499 
    500 	auio.uio_resid = 0;
    501 	for (i = 0; i < iovcnt; i++, iov++) {
    502 		auio.uio_resid += iov->iov_len;
    503 		/*
    504 		 * Writes return ssize_t because -1 is returned on error.
    505 		 * Therefore we must restrict the length to SSIZE_MAX to
    506 		 * avoid garbage return values.
    507 		 */
    508 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    509 			error = EINVAL;
    510 			goto done;
    511 		}
    512 	}
    513 
    514 	/*
    515 	 * If tracing, save a copy of the iovec array.
    516 	 */
    517 	if (ktrpoint(KTR_GENIO))  {
    518 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    519 		if (ktriov != NULL)
    520 			memcpy(ktriov, auio.uio_iov, iovlen);
    521 	}
    522 
    523 	cnt = auio.uio_resid;
    524 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    525 	if (error) {
    526 		if (auio.uio_resid != cnt && (error == ERESTART ||
    527 		    error == EINTR || error == EWOULDBLOCK))
    528 			error = 0;
    529 		if (error == EPIPE) {
    530 			mutex_enter(&proclist_mutex);
    531 			psignal(l->l_proc, SIGPIPE);
    532 			mutex_exit(&proclist_mutex);
    533 		}
    534 	}
    535 	cnt -= auio.uio_resid;
    536 	*retval = cnt;
    537 
    538 	if (ktriov != NULL) {
    539 		ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
    540 		kmem_free(ktriov, iovlen);
    541 	}
    542 
    543  done:
    544 	if (needfree)
    545 		kmem_free(needfree, iovlen);
    546  out:
    547 	FILE_UNUSE(fp, l);
    548 	return (error);
    549 }
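/*
 * A hypothetical gather-write sketch matching do_filewritev() above: a
 * header and a payload stay in separate buffers yet reach the descriptor
 * through a single system call.  Names are illustrative only.
 */
#if 0
#include <sys/uio.h>
#include <unistd.h>

static ssize_t
send_message(int fd, void *hdr, size_t hdrlen, void *payload, size_t paylen)
{
	struct iovec iov[2];

	iov[0].iov_base = hdr;
	iov[0].iov_len = hdrlen;
	iov[1].iov_base = payload;
	iov[1].iov_len = paylen;

	return writev(fd, iov, 2);
}
#endif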
    550 
    551 /*
    552  * Ioctl system call
    553  */
    554 /* ARGSUSED */
    555 int
    556 sys_ioctl(lwp_t *l, void *v, register_t *retval)
    557 {
    558 	struct sys_ioctl_args /* {
    559 		syscallarg(int)		fd;
    560 		syscallarg(u_long)	com;
    561 		syscallarg(void *)	data;
    562 	} */ *uap = v;
    563 	struct file	*fp;
    564 	proc_t		*p;
    565 	struct filedesc	*fdp;
    566 	u_long		com;
    567 	int		error;
    568 	u_int		size;
    569 	void 		*data, *memp;
    570 #define	STK_PARAMS	128
    571 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
    572 
    573 	error = 0;
    574 	p = l->l_proc;
    575 	fdp = p->p_fd;
    576 
    577 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
    578 		return (EBADF);
    579 
    580 	FILE_USE(fp);
    581 
    582 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
    583 		error = EBADF;
    584 		com = 0;
    585 		goto out;
    586 	}
    587 
    588 	switch (com = SCARG(uap, com)) {
    589 	case FIONCLEX:
    590 		rw_enter(&fdp->fd_lock, RW_WRITER);
    591 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
    592 		rw_exit(&fdp->fd_lock);
    593 		goto out;
    594 
    595 	case FIOCLEX:
    596 		rw_enter(&fdp->fd_lock, RW_WRITER);
    597 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
    598 		rw_exit(&fdp->fd_lock);
    599 		goto out;
    600 	}
    601 
    602 	/*
    603 	 * Interpret high order word to find amount of data to be
    604 	 * copied to/from the user's address space.
    605 	 */
    606 	size = IOCPARM_LEN(com);
    607 	if (size > IOCPARM_MAX) {
    608 		error = ENOTTY;
    609 		goto out;
    610 	}
    611 	memp = NULL;
    612 	if (size > sizeof(stkbuf)) {
    613 		memp = kmem_alloc(size, KM_SLEEP);
    614 		data = memp;
    615 	} else
    616 		data = (void *)stkbuf;
    617 	if (com&IOC_IN) {
    618 		if (size) {
    619 			error = copyin(SCARG(uap, data), data, size);
    620 			if (error) {
    621 				if (memp)
    622 					kmem_free(memp, size);
    623 				goto out;
    624 			}
    625 			ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data),
    626 			    size, 0);
    627 		} else
    628 			*(void **)data = SCARG(uap, data);
    629 	} else if ((com&IOC_OUT) && size)
    630 		/*
    631 		 * Zero the buffer so the user always
    632 		 * gets back something deterministic.
    633 		 */
    634 		memset(data, 0, size);
    635 	else if (com&IOC_VOID)
    636 		*(void **)data = SCARG(uap, data);
    637 
    638 	switch (com) {
    639 
    640 	case FIONBIO:
    641 		mutex_enter(&fp->f_lock);
    642 		if (*(int *)data != 0)
    643 			fp->f_flag |= FNONBLOCK;
    644 		else
    645 			fp->f_flag &= ~FNONBLOCK;
    646 		mutex_exit(&fp->f_lock);
    647 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
    648 		break;
    649 
    650 	case FIOASYNC:
    651 		mutex_enter(&fp->f_lock);
    652 		if (*(int *)data != 0)
    653 			fp->f_flag |= FASYNC;
    654 		else
    655 			fp->f_flag &= ~FASYNC;
    656 		mutex_exit(&fp->f_lock);
    657 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
    658 		break;
    659 
    660 	default:
    661 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
    662 		/*
    663 		 * Copy any data out to the user; the size was
    664 		 * already set and checked above.
    665 		 */
    666 		if (error == 0 && (com&IOC_OUT) && size) {
    667 			error = copyout(data, SCARG(uap, data), size);
    668 			ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
    669 			    size, error);
    670 		}
    671 		break;
    672 	}
    673 	if (memp)
    674 		kmem_free(memp, size);
    675  out:
    676 	FILE_UNUSE(fp, l);
    677 	switch (error) {
    678 	case -1:
    679 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
    680 		    "pid=%d comm=%s\n",
    681 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
    682 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
    683 		    p->p_pid, p->p_comm);
    684 		/* FALLTHROUGH */
    685 	case EPASSTHROUGH:
    686 		error = ENOTTY;
    687 		/* FALLTHROUGH */
    688 	default:
    689 		return (error);
    690 	}
    691 }
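/*
 * A hedged sketch of how the command word decoded above is constructed:
 * the _IO*() macros from <sys/ioccom.h> pack the transfer direction
 * (IOC_IN/IOC_OUT/IOC_VOID), the argument size reported by IOCPARM_LEN(),
 * a group letter and a command number into "com".  The structure and the
 * 'E' group below are hypothetical.
 */
#if 0
#include <sys/ioccom.h>

struct example_stats {
	int	es_rxpackets;
	int	es_txpackets;
};

/* IOC_OUT is set: sys_ioctl() zeroes the buffer and copies it back out. */
#define	EXAMPLE_GET_STATS	_IOR('E', 1, struct example_stats)

/* IOC_IN is set: sys_ioctl() copyin()s an int before calling fo_ioctl. */
#define	EXAMPLE_SET_ENABLE	_IOW('E', 2, int)
#endif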
    692 
    693 /*
    694  * Select system call.
    695  */
    696 int
    697 sys_pselect(lwp_t *l, void *v, register_t *retval)
    698 {
    699 	struct sys_pselect_args /* {
    700 		syscallarg(int)				nd;
    701 		syscallarg(fd_set *)			in;
    702 		syscallarg(fd_set *)			ou;
    703 		syscallarg(fd_set *)			ex;
    704 		syscallarg(const struct timespec *)	ts;
    705 		syscallarg(sigset_t *)			mask;
    706 	} */ * const uap = v;
    707 	struct timespec	ats;
    708 	struct timeval	atv, *tv = NULL;
    709 	sigset_t	amask, *mask = NULL;
    710 	int		error;
    711 
    712 	if (SCARG(uap, ts)) {
    713 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    714 		if (error)
    715 			return error;
    716 		atv.tv_sec = ats.tv_sec;
    717 		atv.tv_usec = ats.tv_nsec / 1000;
    718 		tv = &atv;
    719 	}
    720 	if (SCARG(uap, mask) != NULL) {
    721 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    722 		if (error)
    723 			return error;
    724 		mask = &amask;
    725 	}
    726 
    727 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    728 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    729 }
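/*
 * A hypothetical userland sketch of why pselect() takes a signal mask:
 * selcommon() installs it in l_sigmask for the duration of the wait, so a
 * signal such as SIGINT can stay blocked except while the process is
 * actually waiting.  The helper name is made up.
 */
#if 0
#include <sys/select.h>
#include <signal.h>

static int
wait_readable(int fd)
{
	sigset_t blockset, waitmask;
	fd_set rset;

	/* Keep SIGINT blocked in normal operation; remember the old mask. */
	sigemptyset(&blockset);
	sigaddset(&blockset, SIGINT);
	sigprocmask(SIG_BLOCK, &blockset, &waitmask);

	FD_ZERO(&rset);
	FD_SET(fd, &rset);

	/* SIGINT is only deliverable while pselect() itself is waiting. */
	return pselect(fd + 1, &rset, NULL, NULL, NULL, &waitmask);
}
#endif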
    730 
    731 int
    732 inittimeleft(struct timeval *tv, struct timeval *sleeptv)
    733 {
    734 	if (itimerfix(tv))
    735 		return -1;
    736 	getmicrouptime(sleeptv);
    737 	return 0;
    738 }
    739 
    740 int
    741 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
    742 {
    743 	/*
    744 	 * We have to recalculate the timeout on every retry.
    745 	 */
    746 	struct timeval slepttv;
    747 	/*
    748 	 * Reduce tv by the time elapsed, measured on the
    749 	 * monotonic time scale.
    750 	 */
    751 	getmicrouptime(&slepttv);
    752 	timeradd(tv, sleeptv, tv);
    753 	timersub(tv, &slepttv, tv);
    754 	*sleeptv = slepttv;
    755 	return tvtohz(tv);
    756 }
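/*
 * Worked example with hypothetical numbers: if select() is given a 5s
 * timeout, inittimeleft() records the current uptime in sleeptv.  Should
 * the LWP wake 2s later with nothing ready, gettimeleft() adds the old
 * sleeptv, subtracts the new uptime (leaving tv = 3s), refreshes sleeptv
 * and returns the remaining time converted to ticks by tvtohz().
 */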
    757 
    758 int
    759 sys_select(lwp_t *l, void *v, register_t *retval)
    760 {
    761 	struct sys_select_args /* {
    762 		syscallarg(int)			nd;
    763 		syscallarg(fd_set *)		in;
    764 		syscallarg(fd_set *)		ou;
    765 		syscallarg(fd_set *)		ex;
    766 		syscallarg(struct timeval *)	tv;
    767 	} */ * const uap = v;
    768 	struct timeval atv, *tv = NULL;
    769 	int error;
    770 
    771 	if (SCARG(uap, tv)) {
    772 		error = copyin(SCARG(uap, tv), (void *)&atv,
    773 			sizeof(atv));
    774 		if (error)
    775 			return error;
    776 		tv = &atv;
    777 	}
    778 
    779 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    780 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    781 }
    782 
    783 int
    784 selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    785 	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
    786 {
    787 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
    788 			    sizeof(fd_mask) * 6];
    789 	proc_t		* const p = l->l_proc;
    790 	char 		*bits;
    791 	int		ncoll, error, timo;
    792 	size_t		ni;
    793 	sigset_t	oldmask;
    794 	struct timeval  sleeptv;
    795 
    796 	error = 0;
    797 	if (nd < 0)
    798 		return (EINVAL);
    799 	if (nd > p->p_fd->fd_nfiles) {
    800 		/* forgiving; slightly wrong */
    801 		nd = p->p_fd->fd_nfiles;
    802 	}
    803 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
    804 	if (ni * 6 > sizeof(smallbits))
    805 		bits = kmem_alloc(ni * 6, KM_SLEEP);
    806 	else
    807 		bits = smallbits;
    808 
    809 #define	getbits(name, x)						\
    810 	if (u_ ## name) {						\
    811 		error = copyin(u_ ## name, bits + ni * x, ni);		\
    812 		if (error)						\
    813 			goto done;					\
    814 	} else								\
    815 		memset(bits + ni * x, 0, ni);
    816 	getbits(in, 0);
    817 	getbits(ou, 1);
    818 	getbits(ex, 2);
    819 #undef	getbits
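	/*
	 * The single "bits" buffer holds six descriptor sets of ni bytes
	 * each: slots 0-2 are the in/ou/ex sets copied in above, while
	 * slots 3-5 receive the corresponding result sets from selscan()
	 * and are copied back to userland once scanning completes.
	 */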
    820 
    821 	timo = 0;
    822 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
    823 		error = EINVAL;
    824 		goto done;
    825 	}
    826 
    827 	if (mask) {
    828 		sigminusset(&sigcantmask, mask);
    829 		mutex_enter(&p->p_smutex);
    830 		oldmask = l->l_sigmask;
    831 		l->l_sigmask = *mask;
    832 		mutex_exit(&p->p_smutex);
    833 	} else
    834 		oldmask = l->l_sigmask;	/* XXXgcc */
    835 
    836 	mutex_enter(&select_lock);
    837 	SLIST_INIT(&l->l_selwait);
    838 	for (;;) {
    839 	 	l->l_selflag = SEL_SCANNING;
    840 		ncoll = nselcoll;
    841  		mutex_exit(&select_lock);
    842 
    843 		error = selscan(l, (fd_mask *)(bits + ni * 0),
    844 		    (fd_mask *)(bits + ni * 3), nd, retval);
    845 
    846 		mutex_enter(&select_lock);
    847 		if (error || *retval)
    848 			break;
    849 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
    850 			break;
    851 		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
    852 			continue;
    853 		l->l_selflag = SEL_BLOCKING;
    854 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
    855 		if (error != 0)
    856 			break;
    857 	}
    858 	selclear();
    859 	mutex_exit(&select_lock);
    860 
    861 	if (mask) {
    862 		mutex_enter(&p->p_smutex);
    863 		l->l_sigmask = oldmask;
    864 		mutex_exit(&p->p_smutex);
    865 	}
    866 
    867  done:
    868 	/* select is not restarted after signals... */
    869 	if (error == ERESTART)
    870 		error = EINTR;
    871 	if (error == EWOULDBLOCK)
    872 		error = 0;
    873 	if (error == 0 && u_in != NULL)
    874 		error = copyout(bits + ni * 3, u_in, ni);
    875 	if (error == 0 && u_ou != NULL)
    876 		error = copyout(bits + ni * 4, u_ou, ni);
    877 	if (error == 0 && u_ex != NULL)
    878 		error = copyout(bits + ni * 5, u_ex, ni);
    879 	if (bits != smallbits)
    880 		kmem_free(bits, ni * 6);
    881 	return (error);
    882 }
    883 
    884 int
    885 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    886 	register_t *retval)
    887 {
    888 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    889 			       POLLWRNORM | POLLHUP | POLLERR,
    890 			       POLLRDBAND };
    891 	proc_t *p = l->l_proc;
    892 	struct filedesc	*fdp;
    893 	int msk, i, j, fd, n;
    894 	fd_mask ibits, obits;
    895 	struct file *fp;
    896 
    897 	fdp = p->p_fd;
    898 	n = 0;
    899 	for (msk = 0; msk < 3; msk++) {
    900 		for (i = 0; i < nfd; i += NFDBITS) {
    901 			ibits = *ibitp++;
    902 			obits = 0;
    903 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    904 				ibits &= ~(1 << j);
    905 				if ((fp = fd_getfile(fdp, fd)) == NULL)
    906 					return (EBADF);
    907 				FILE_USE(fp);
    908 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
    909 					obits |= (1 << j);
    910 					n++;
    911 				}
    912 				FILE_UNUSE(fp, l);
    913 			}
    914 			*obitp++ = obits;
    915 		}
    916 	}
    917 	*retval = n;
    918 	return (0);
    919 }
    920 
    921 /*
    922  * Poll system call.
    923  */
    924 int
    925 sys_poll(lwp_t *l, void *v, register_t *retval)
    926 {
    927 	struct sys_poll_args /* {
    928 		syscallarg(struct pollfd *)	fds;
    929 		syscallarg(u_int)		nfds;
    930 		syscallarg(int)			timeout;
    931 	} */ * const uap = v;
    932 	struct timeval	atv, *tv = NULL;
    933 
    934 	if (SCARG(uap, timeout) != INFTIM) {
    935 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    936 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    937 		tv = &atv;
    938 	}
    939 
    940 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    941 		tv, NULL);
    942 }
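/*
 * A hypothetical userland sketch of the interface handled above: each
 * pollfd names a descriptor and the events of interest, and the timeout
 * is in milliseconds (INFTIM, -1, meaning wait indefinitely).
 */
#if 0
#include <poll.h>

/* Wait up to five seconds for either descriptor to become readable. */
static int
wait_either(int fd0, int fd1)
{
	struct pollfd pfd[2];

	pfd[0].fd = fd0;
	pfd[0].events = POLLIN;
	pfd[1].fd = fd1;
	pfd[1].events = POLLIN;

	/* Returns how many pollfds have non-zero revents, 0 on timeout. */
	return poll(pfd, 2, 5000);
}
#endif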
    943 
    944 /*
    945  * Poll system call.
    946  */
    947 int
    948 sys_pollts(lwp_t *l, void *v, register_t *retval)
    949 {
    950 	struct sys_pollts_args /* {
    951 		syscallarg(struct pollfd *)		fds;
    952 		syscallarg(u_int)			nfds;
    953 		syscallarg(const struct timespec *)	ts;
    954 		syscallarg(const sigset_t *)		mask;
    955 	} */ * const uap = v;
    956 	struct timespec	ats;
    957 	struct timeval	atv, *tv = NULL;
    958 	sigset_t	amask, *mask = NULL;
    959 	int		error;
    960 
    961 	if (SCARG(uap, ts)) {
    962 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    963 		if (error)
    964 			return error;
    965 		atv.tv_sec = ats.tv_sec;
    966 		atv.tv_usec = ats.tv_nsec / 1000;
    967 		tv = &atv;
    968 	}
    969 	if (SCARG(uap, mask)) {
    970 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    971 		if (error)
    972 			return error;
    973 		mask = &amask;
    974 	}
    975 
    976 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    977 		tv, mask);
    978 }
    979 
    980 int
    981 pollcommon(lwp_t *l, register_t *retval,
    982 	struct pollfd *u_fds, u_int nfds,
    983 	struct timeval *tv, sigset_t *mask)
    984 {
    985 	char		smallbits[32 * sizeof(struct pollfd)];
    986 	proc_t		* const p = l->l_proc;
    987 	void *		bits;
    988 	sigset_t	oldmask;
    989 	int		ncoll, error, timo;
    990 	size_t		ni;
    991 	struct timeval	sleeptv;
    992 
    993 	if (nfds > p->p_fd->fd_nfiles) {
    994 		/* forgiving; slightly wrong */
    995 		nfds = p->p_fd->fd_nfiles;
    996 	}
    997 	ni = nfds * sizeof(struct pollfd);
    998 	if (ni > sizeof(smallbits))
    999 		bits = kmem_alloc(ni, KM_SLEEP);
   1000 	else
   1001 		bits = smallbits;
   1002 
   1003 	error = copyin(u_fds, bits, ni);
   1004 	if (error)
   1005 		goto done;
   1006 
   1007 	timo = 0;
   1008 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
   1009 		error = EINVAL;
   1010 		goto done;
   1011 	}
   1012 
   1013 	if (mask) {
   1014 		sigminusset(&sigcantmask, mask);
   1015 		mutex_enter(&p->p_smutex);
   1016 		oldmask = l->l_sigmask;
   1017 		l->l_sigmask = *mask;
   1018 		mutex_exit(&p->p_smutex);
   1019 	} else
   1020 		oldmask = l->l_sigmask;	/* XXXgcc */
   1021 
   1022 	mutex_enter(&select_lock);
   1023 	SLIST_INIT(&l->l_selwait);
   1024 	for (;;) {
   1025 		ncoll = nselcoll;
   1026 		l->l_selflag = SEL_SCANNING;
   1027 		mutex_exit(&select_lock);
   1028 
   1029 		error = pollscan(l, (struct pollfd *)bits, nfds, retval);
   1030 
   1031 		mutex_enter(&select_lock);
   1032 		if (error || *retval)
   1033 			break;
   1034 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
   1035 			break;
   1036 		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
   1037 			continue;
   1038 		l->l_selflag = SEL_BLOCKING;
   1039 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
   1040 		if (error != 0)
   1041 			break;
   1042 	}
   1043 	selclear();
   1044 	mutex_exit(&select_lock);
   1045 
   1046 	if (mask) {
   1047 		mutex_enter(&p->p_smutex);
   1048 		l->l_sigmask = oldmask;
   1049 		mutex_exit(&p->p_smutex);
   1050 	}
   1051  done:
   1052 	/* poll is not restarted after signals... */
   1053 	if (error == ERESTART)
   1054 		error = EINTR;
   1055 	if (error == EWOULDBLOCK)
   1056 		error = 0;
   1057 	if (error == 0)
   1058 		error = copyout(bits, u_fds, ni);
   1059 	if (bits != smallbits)
   1060 		kmem_free(bits, ni);
   1061 	return (error);
   1062 }
   1063 
   1064 int
   1065 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1066 {
   1067 	proc_t		*p = l->l_proc;
   1068 	struct filedesc	*fdp;
   1069 	int		i, n;
   1070 	struct file	*fp;
   1071 
   1072 	fdp = p->p_fd;
   1073 	n = 0;
   1074 	for (i = 0; i < nfd; i++, fds++) {
   1075 		if (fds->fd >= fdp->fd_nfiles) {
   1076 			fds->revents = POLLNVAL;
   1077 			n++;
   1078 		} else if (fds->fd < 0) {
   1079 			fds->revents = 0;
   1080 		} else {
   1081 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
   1082 				fds->revents = POLLNVAL;
   1083 				n++;
   1084 			} else {
   1085 				FILE_USE(fp);
   1086 				fds->revents = (*fp->f_ops->fo_poll)(fp,
   1087 				    fds->events | POLLERR | POLLHUP, l);
   1088 				if (fds->revents != 0)
   1089 					n++;
   1090 				FILE_UNUSE(fp, l);
   1091 			}
   1092 		}
   1093 	}
   1094 	*retval = n;
   1095 	return (0);
   1096 }
   1097 
   1098 /*ARGSUSED*/
   1099 int
   1100 seltrue(dev_t dev, int events, lwp_t *l)
   1101 {
   1102 
   1103 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1104 }
   1105 
   1106 /*
   1107  * Record a select request.
   1108  */
   1109 void
   1110 selrecord(lwp_t *selector, struct selinfo *sip)
   1111 {
   1112 
   1113 	mutex_enter(&select_lock);
   1114 	if (sip->sel_lwp == NULL) {
   1115 		/* First named waiter, although there may be more. */
   1116 		sip->sel_lwp = selector;
   1117 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1118 	} else if (sip->sel_lwp != selector) {
   1119 		/* Multiple waiters. */
   1120 		sip->sel_collision = true;
   1121 	}
   1122 	mutex_exit(&select_lock);
   1123 }
   1124 
   1125 /*
   1126  * Do a wakeup when a selectable event occurs.
   1127  */
   1128 void
   1129 selwakeup(struct selinfo *sip)
   1130 {
   1131 	lwp_t *l;
   1132 
   1133 	mutex_enter(&select_lock);
   1134 	if (sip->sel_collision) {
   1135 		/* Multiple waiters - just notify everybody. */
   1136 		nselcoll++;
   1137 		sip->sel_collision = false;
   1138 		cv_broadcast(&select_cv);
   1139 	} else if (sip->sel_lwp != NULL) {
   1140 		/* Only one LWP waiting. */
   1141 		l = sip->sel_lwp;
   1142 		if (l->l_selflag == SEL_BLOCKING) {
   1143 			/*
   1144 			 * If it's sleeping, wake it up.  If not, it's
   1145 			 * already awake but hasn't yet removed itself
   1146 			 * from the selector.  We reset the state below
   1147 			 * so that we only attempt to do this once.
   1148 			 */
   1149 			lwp_lock(l);
   1150 			if (l->l_wchan == &select_cv) {
   1151 				/* lwp_unsleep() releases the LWP lock. */
   1152 				lwp_unsleep(l);
   1153 			} else
   1154 				lwp_unlock(l);
   1155 		} else {
   1156 			/*
   1157 			 * Not yet asleep.  Reset its state below so that
   1158 			 * it will go around again.
   1159 			 */
   1160 		}
   1161 		l->l_selflag = SEL_RESET;
   1162 	}
   1163 	mutex_exit(&select_lock);
   1164 }
   1165 
   1166 void
   1167 selnotify(struct selinfo *sip, long knhint)
   1168 {
   1169 
   1170 	selwakeup(sip);
   1171 	KNOTE(&sip->sel_klist, knhint);
   1172 }
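/*
 * A hypothetical driver fragment showing the intended use of the
 * selrecord()/selnotify() pair above; the softc, its fields and the
 * function names are made up for illustration.
 */
#if 0
struct example_softc {
	struct selinfo	sc_rsel;	/* set up with selinit() at attach */
	int		sc_haveinput;
};

static int
example_poll(struct example_softc *sc, int events, lwp_t *l)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->sc_haveinput)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);	/* remember waiter */
	}
	return revents;
}

static void
example_intr(struct example_softc *sc)
{

	sc->sc_haveinput = 1;
	selnotify(&sc->sc_rsel, 0);	/* wake select()/poll() waiters */
}
#endif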
   1173 
   1174 /*
   1175  * Remove an LWP from all objects that it is waiting for.
   1176  */
   1177 static void
   1178 selclear(void)
   1179 {
   1180 	struct selinfo *sip;
   1181 	lwp_t *l = curlwp;
   1182 
   1183 	KASSERT(mutex_owned(&select_lock));
   1184 
   1185 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1186 		KASSERT(sip->sel_lwp == l);
   1187 		sip->sel_lwp = NULL;
   1188 	}
   1189 }
   1190 
   1191 /*
   1192  * Initialize the select/poll system calls.
   1193  */
   1194 void
   1195 selsysinit(void)
   1196 {
   1197 
   1198 	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
   1199 	cv_init(&select_cv, "select");
   1200 }
   1201 
   1202 /*
   1203  * Initialize a selector.
   1204  */
   1205 void
   1206 selinit(struct selinfo *sip)
   1207 {
   1208 
   1209 	memset(sip, 0, sizeof(*sip));
   1210 }
   1211 
   1212 /*
   1213  * Destroy a selector.  The owning object must not gain new
   1214  * references while this is in progress: all activity on the
   1215  * selector must be stopped.
   1216  */
   1217 void
   1218 seldestroy(struct selinfo *sip)
   1219 {
   1220 	lwp_t *l;
   1221 
   1222 	if (sip->sel_lwp == NULL)
   1223 		return;
   1224 
   1225 	mutex_enter(&select_lock);
   1226 	if ((l = sip->sel_lwp) != NULL) {
   1227 		/* This should rarely happen, so SLIST_REMOVE() is OK. */
   1228 		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
   1229 		sip->sel_lwp = NULL;
   1230 	}
   1231 	mutex_exit(&select_lock);
   1232 }
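/*
 * In a driver like the hypothetical sketch following selnotify() above,
 * sc_rsel would be prepared with selinit() at attach time and torn down
 * with seldestroy() at detach time, once the device can no longer be
 * reached by new select()/poll() callers.
 */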
   1233