      1 /*	$NetBSD: sys_generic.c,v 1.100.2.6 2007/04/28 22:40:04 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.100.2.6 2007/04/28 22:40:04 ad Exp $");
     81 
     82 #include "opt_ktrace.h"
     83 
     84 #include <sys/param.h>
     85 #include <sys/systm.h>
     86 #include <sys/filedesc.h>
     87 #include <sys/ioctl.h>
     88 #include <sys/file.h>
     89 #include <sys/proc.h>
     90 #include <sys/socketvar.h>
     91 #include <sys/signalvar.h>
     92 #include <sys/uio.h>
     93 #include <sys/kernel.h>
     94 #include <sys/stat.h>
     95 #include <sys/kmem.h>
     96 #include <sys/poll.h>
     97 #include <sys/mount.h>
     98 #include <sys/syscallargs.h>
     99 #ifdef KTRACE
    100 #include <sys/ktrace.h>
    101 #endif
    102 
    103 #include <uvm/uvm_extern.h>
    104 
    105 /* Flags for lwp::l_selflag. */
    106 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
    107 #define	SEL_SCANNING	1	/* polling descriptors */
    108 #define	SEL_BLOCKING	2	/* about to block on select_cv */
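
/*
 * l_selflag life cycle: selcommon() and pollcommon() set SEL_SCANNING
 * before each scan of the descriptors and SEL_BLOCKING just before
 * sleeping on select_cv.  selwakeup() resets the flag to SEL_RESET,
 * waking the LWP if it is already blocked, which forces another scan
 * instead of a sleep.
 */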
    109 
    110 static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
    111 static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
    112 static void	selclear(void);
    113 
    114 /* Global state for select()/poll(). */
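/*
 * select_lock protects the selinfo/LWP associations made by selrecord(),
 * the per-LWP l_selflag state and nselcoll.  LWPs block on select_cv;
 * nselcoll counts collision wakeups so that a scan which raced with a
 * broadcast is retried rather than lost.
 */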
    115 kmutex_t	select_lock;
    116 kcondvar_t	select_cv;
    117 int		nselcoll;
    118 
    119 /*
    120  * Read system call.
    121  */
    122 /* ARGSUSED */
    123 int
    124 sys_read(lwp_t *l, void *v, register_t *retval)
    125 {
    126 	struct sys_read_args /* {
    127 		syscallarg(int)		fd;
    128 		syscallarg(void *)	buf;
    129 		syscallarg(size_t)	nbyte;
    130 	} */ *uap = v;
    131 	int		fd;
    132 	struct file	*fp;
    133 	proc_t		*p;
    134 	struct filedesc	*fdp;
    135 
    136 	fd = SCARG(uap, fd);
    137 	p = l->l_proc;
    138 	fdp = p->p_fd;
    139 
    140 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    141 		return (EBADF);
    142 
    143 	if ((fp->f_flag & FREAD) == 0) {
    144 		mutex_exit(&fp->f_lock);
    145 		return (EBADF);
    146 	}
    147 
    148 	FILE_USE(fp);
    149 
    150 	/* dofileread() will unuse the descriptor for us */
    151 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    152 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    153 }
    154 
    155 int
    156 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
    157 	off_t *offset, int flags, register_t *retval)
    158 {
    159 	struct iovec aiov;
    160 	struct uio auio;
    161 	proc_t *p;
    162 	struct vmspace *vm;
    163 	size_t cnt;
    164 	int error;
    165 #ifdef KTRACE
    166 	struct iovec	ktriov = { .iov_base = NULL, };
    167 #endif
    168 	p = l->l_proc;
    169 
    170 	error = proc_vmspace_getref(p, &vm);
    171 	if (error) {
    172 		goto out;
    173 	}
    174 
    175 	aiov.iov_base = (void *)buf;
    176 	aiov.iov_len = nbyte;
    177 	auio.uio_iov = &aiov;
    178 	auio.uio_iovcnt = 1;
    179 	auio.uio_resid = nbyte;
    180 	auio.uio_rw = UIO_READ;
    181 	auio.uio_vmspace = vm;
    182 
    183 	/*
    184 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    185 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    186 	 * values.
    187 	 */
    188 	if (auio.uio_resid > SSIZE_MAX) {
    189 		error = EINVAL;
    190 		goto out;
    191 	}
    192 
    193 #ifdef KTRACE
    194 	/*
    195 	 * if tracing, save a copy of iovec
    196 	 */
    197 	if (KTRPOINT(p, KTR_GENIO))
    198 		ktriov = aiov;
    199 #endif
    200 	cnt = auio.uio_resid;
    201 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
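	/*
	 * If the transfer was interrupted after some data had already
	 * been moved, hide the error: the caller sees a short read
	 * rather than a failure.
	 */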
    202 	if (error)
    203 		if (auio.uio_resid != cnt && (error == ERESTART ||
    204 		    error == EINTR || error == EWOULDBLOCK))
    205 			error = 0;
    206 	cnt -= auio.uio_resid;
    207 #ifdef KTRACE
    208 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
    209 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
    210 #endif
    211 	*retval = cnt;
    212  out:
    213 	FILE_UNUSE(fp, l);
    214 	uvmspace_free(vm);
    215 	return (error);
    216 }
    217 
    218 /*
    219  * Scatter read system call.
    220  */
    221 int
    222 sys_readv(lwp_t *l, void *v, register_t *retval)
    223 {
    224 	struct sys_readv_args /* {
    225 		syscallarg(int)				fd;
    226 		syscallarg(const struct iovec *)	iovp;
    227 		syscallarg(int)				iovcnt;
    228 	} */ *uap = v;
    229 	struct filedesc	*fdp;
    230 	struct file *fp;
    231 	proc_t *p;
    232 	int fd;
    233 
    234 	fd = SCARG(uap, fd);
    235 	p = l->l_proc;
    236 	fdp = p->p_fd;
    237 
    238 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    239 		return (EBADF);
    240 
    241 	if ((fp->f_flag & FREAD) == 0) {
    242 		mutex_exit(&fp->f_lock);
    243 		return (EBADF);
    244 	}
    245 
    246 	FILE_USE(fp);
    247 
    248 	/* dofilereadv() will unuse the descriptor for us */
    249 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
    250 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    251 }
    252 
    253 int
    254 dofilereadv(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
    255 	int iovcnt, off_t *offset, int flags, register_t *retval)
    256 {
    257 	proc_t		*p;
    258 	struct uio	auio;
    259 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
    260 	struct vmspace	*vm;
    261 	int		i, error;
    262 	size_t		cnt;
    263 	u_int		iovlen;
    264 #ifdef KTRACE
    265 	struct iovec	*ktriov;
    266 #endif
    267 
    268 	p = l->l_proc;
    269 	error = proc_vmspace_getref(p, &vm);
    270 	if (error) {
    271 		goto out;
    272 	}
    273 
    274 #ifdef KTRACE
    275 	ktriov = NULL;
    276 #endif
    277 	/* note: can't use iovlen until iovcnt is validated */
    278 	iovlen = iovcnt * sizeof(struct iovec);
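	/*
	 * Vectors of at most UIO_SMALLIOV entries are staged in the
	 * on-stack aiov array; larger ones, up to IOV_MAX, get a
	 * temporary allocation that is released at `done'.  A zero or
	 * oversized count fails with EINVAL.
	 */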
    279 	if ((u_int)iovcnt > UIO_SMALLIOV) {
    280 		if ((u_int)iovcnt > IOV_MAX) {
    281 			error = EINVAL;
    282 			goto out;
    283 		}
    284 		iov = kmem_alloc(iovlen, KM_SLEEP);
    285 		needfree = iov;
    286 	} else if ((u_int)iovcnt > 0) {
    287 		iov = aiov;
    288 		needfree = NULL;
    289 	} else {
    290 		error = EINVAL;
    291 		goto out;
    292 	}
    293 
    294 	auio.uio_iov = iov;
    295 	auio.uio_iovcnt = iovcnt;
    296 	auio.uio_rw = UIO_READ;
    297 	auio.uio_vmspace = vm;
    298 	error = copyin(iovp, iov, iovlen);
    299 	if (error)
    300 		goto done;
    301 	auio.uio_resid = 0;
    302 	for (i = 0; i < iovcnt; i++) {
    303 		auio.uio_resid += iov->iov_len;
    304 		/*
    305 		 * Reads return ssize_t because -1 is returned on error.
    306 		 * Therefore we must restrict the length to SSIZE_MAX to
    307 		 * avoid garbage return values.
    308 		 */
    309 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    310 			error = EINVAL;
    311 			goto done;
    312 		}
    313 		iov++;
    314 	}
    315 #ifdef KTRACE
    316 	/*
    317 	 * if tracing, save a copy of iovec
    318 	 */
    319 	if (KTRPOINT(p, KTR_GENIO))  {
    320 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    321 		memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
    322 	}
    323 #endif
    324 	cnt = auio.uio_resid;
    325 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    326 	if (error)
    327 		if (auio.uio_resid != cnt && (error == ERESTART ||
    328 		    error == EINTR || error == EWOULDBLOCK))
    329 			error = 0;
    330 	cnt -= auio.uio_resid;
    331 #ifdef KTRACE
    332 	if (ktriov != NULL) {
    333 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
    334 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
    335 		kmem_free(ktriov, iovlen);
    336 	}
    337 #endif
    338 	*retval = cnt;
    339  done:
    340 	if (needfree)
    341 		kmem_free(needfree, iovlen);
    342  out:
    343 	FILE_UNUSE(fp, l);
    344 	uvmspace_free(vm);
    345 	return (error);
    346 }
    347 
    348 /*
    349  * Write system call
    350  */
    351 int
    352 sys_write(lwp_t *l, void *v, register_t *retval)
    353 {
    354 	struct sys_write_args /* {
    355 		syscallarg(int)			fd;
    356 		syscallarg(const void *)	buf;
    357 		syscallarg(size_t)		nbyte;
    358 	} */ *uap = v;
    359 	int		fd;
    360 	struct file	*fp;
    361 	proc_t		*p;
    362 	struct filedesc	*fdp;
    363 
    364 	fd = SCARG(uap, fd);
    365 	p = l->l_proc;
    366 	fdp = p->p_fd;
    367 
    368 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    369 		return (EBADF);
    370 
    371 	if ((fp->f_flag & FWRITE) == 0) {
    372 		mutex_exit(&fp->f_lock);
    373 		return (EBADF);
    374 	}
    375 
    376 	FILE_USE(fp);
    377 
    378 	/* dofilewrite() will unuse the descriptor for us */
    379 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    380 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    381 }
    382 
    383 int
    384 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
    385 	size_t nbyte, off_t *offset, int flags, register_t *retval)
    386 {
    387 	struct iovec aiov;
    388 	struct uio auio;
    389 	proc_t *p;
    390 	struct vmspace *vm;
    391 	size_t cnt;
    392 	int error;
    393 #ifdef KTRACE
    394 	struct iovec	ktriov = { .iov_base = NULL, };
    395 #endif
    396 
    397 	p = l->l_proc;
    398 	error = proc_vmspace_getref(p, &vm);
    399 	if (error) {
    400 		goto out;
    401 	}
    402 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
    403 	aiov.iov_len = nbyte;
    404 	auio.uio_iov = &aiov;
    405 	auio.uio_iovcnt = 1;
    406 	auio.uio_resid = nbyte;
    407 	auio.uio_rw = UIO_WRITE;
    408 	auio.uio_vmspace = vm;
    409 
    410 	/*
    411 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    412 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    413 	 * values.
    414 	 */
    415 	if (auio.uio_resid > SSIZE_MAX) {
    416 		error = EINVAL;
    417 		goto out;
    418 	}
    419 
    420 #ifdef KTRACE
    421 	/*
    422 	 * if tracing, save a copy of iovec
    423 	 */
    424 	if (KTRPOINT(p, KTR_GENIO))
    425 		ktriov = aiov;
    426 #endif
    427 	cnt = auio.uio_resid;
    428 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
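	/*
	 * As with reads, an interrupted transfer that moved some data is
	 * reported as a short write rather than an error.  A write to a
	 * broken pipe additionally raises SIGPIPE on the process, the
	 * traditional companion to EPIPE.
	 */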
    429 	if (error) {
    430 		if (auio.uio_resid != cnt && (error == ERESTART ||
    431 		    error == EINTR || error == EWOULDBLOCK))
    432 			error = 0;
    433 		if (error == EPIPE) {
    434 			mutex_enter(&proclist_mutex);
    435 			psignal(p, SIGPIPE);
    436 			mutex_exit(&proclist_mutex);
    437 		}
    438 	}
    439 	cnt -= auio.uio_resid;
    440 #ifdef KTRACE
    441 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
    442 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
    443 #endif
    444 	*retval = cnt;
    445  out:
    446 	FILE_UNUSE(fp, l);
    447 	uvmspace_free(vm);
    448 	return (error);
    449 }
    450 
    451 /*
    452  * Gather write system call
    453  */
    454 int
    455 sys_writev(lwp_t *l, void *v, register_t *retval)
    456 {
    457 	struct sys_writev_args /* {
    458 		syscallarg(int)				fd;
    459 		syscallarg(const struct iovec *)	iovp;
    460 		syscallarg(int)				iovcnt;
    461 	} */ *uap = v;
    462 	int		fd;
    463 	struct file	*fp;
    464 	proc_t		*p;
    465 	struct filedesc	*fdp;
    466 
    467 	fd = SCARG(uap, fd);
    468 	p = l->l_proc;
    469 	fdp = p->p_fd;
    470 
    471 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    472 		return (EBADF);
    473 
    474 	if ((fp->f_flag & FWRITE) == 0) {
    475 		mutex_exit(&fp->f_lock);
    476 		return (EBADF);
    477 	}
    478 
    479 	FILE_USE(fp);
    480 
    481 	/* dofilewritev() will unuse the descriptor for us */
    482 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
    483 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    484 }
    485 
    486 int
    487 dofilewritev(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
    488 	int iovcnt, off_t *offset, int flags, register_t *retval)
    489 {
    490 	proc_t		*p;
    491 	struct uio	auio;
    492 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
    493 	struct vmspace	*vm;
    494 	int		i, error;
    495 	size_t		cnt;
    496 	u_int		iovlen;
    497 #ifdef KTRACE
    498 	struct iovec	*ktriov;
    499 #endif
    500 
    501 	p = l->l_proc;
    502 	error = proc_vmspace_getref(p, &vm);
    503 	if (error) {
    504 		goto out;
    505 	}
    506 #ifdef KTRACE
    507 	ktriov = NULL;
    508 #endif
    509 	/* note: can't use iovlen until iovcnt is validated */
    510 	iovlen = iovcnt * sizeof(struct iovec);
    511 	if ((u_int)iovcnt > UIO_SMALLIOV) {
    512 		if ((u_int)iovcnt > IOV_MAX) {
    513 			error = EINVAL;
    514 			goto out;
    515 		}
    516 		iov = kmem_alloc(iovlen, KM_SLEEP);
    517 		needfree = iov;
    518 	} else if ((u_int)iovcnt > 0) {
    519 		iov = aiov;
    520 		needfree = NULL;
    521 	} else {
    522 		error = EINVAL;
    523 		goto out;
    524 	}
    525 
    526 	auio.uio_iov = iov;
    527 	auio.uio_iovcnt = iovcnt;
    528 	auio.uio_rw = UIO_WRITE;
    529 	auio.uio_vmspace = vm;
    530 	error = copyin(iovp, iov, iovlen);
    531 	if (error)
    532 		goto done;
    533 	auio.uio_resid = 0;
    534 	for (i = 0; i < iovcnt; i++) {
    535 		auio.uio_resid += iov->iov_len;
    536 		/*
    537 		 * Writes return ssize_t because -1 is returned on error.
    538 		 * Therefore we must restrict the length to SSIZE_MAX to
    539 		 * avoid garbage return values.
    540 		 */
    541 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    542 			error = EINVAL;
    543 			goto done;
    544 		}
    545 		iov++;
    546 	}
    547 #ifdef KTRACE
    548 	/*
    549 	 * if tracing, save a copy of iovec
    550 	 */
    551 	if (KTRPOINT(p, KTR_GENIO))  {
    552 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    553 		memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
    554 	}
    555 #endif
    556 	cnt = auio.uio_resid;
    557 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    558 	if (error) {
    559 		if (auio.uio_resid != cnt && (error == ERESTART ||
    560 		    error == EINTR || error == EWOULDBLOCK))
    561 			error = 0;
    562 		if (error == EPIPE) {
    563 			mutex_enter(&proclist_mutex);
    564 			psignal(p, SIGPIPE);
    565 			mutex_exit(&proclist_mutex);
    566 		}
    567 	}
    568 	cnt -= auio.uio_resid;
    569 #ifdef KTRACE
    570 	if (ktriov != NULL) {
    571 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
    572 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
    573 		kmem_free(ktriov, iovlen);
    574 	}
    575 #endif
    576 	*retval = cnt;
    577  done:
    578 	if (needfree)
    579 		kmem_free(needfree, iovlen);
    580  out:
    581 	FILE_UNUSE(fp, l);
    582 	uvmspace_free(vm);
    583 	return (error);
    584 }
    585 
    586 /*
    587  * Ioctl system call
    588  */
    589 /* ARGSUSED */
    590 int
    591 sys_ioctl(lwp_t *l, void *v, register_t *retval)
    592 {
    593 	struct sys_ioctl_args /* {
    594 		syscallarg(int)		fd;
    595 		syscallarg(u_long)	com;
    596 		syscallarg(void *)	data;
    597 	} */ *uap = v;
    598 	struct file	*fp;
    599 	proc_t		*p;
    600 	struct filedesc	*fdp;
    601 	u_long		com;
    602 	int		error;
    603 	u_int		size;
    604 	void 		*data, *memp;
    605 #define	STK_PARAMS	128
    606 	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];
    607 
    608 	error = 0;
    609 	p = l->l_proc;
    610 	fdp = p->p_fd;
    611 
    612 	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
    613 		return (EBADF);
    614 
    615 	FILE_USE(fp);
    616 
    617 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
    618 		error = EBADF;
    619 		com = 0;
    620 		goto out;
    621 	}
    622 
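	/*
	 * FIOCLEX and FIONCLEX only toggle the close-on-exec flag in the
	 * descriptor table, so they are handled here without consulting
	 * the file's own ioctl routine.
	 */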
    623 	switch (com = SCARG(uap, com)) {
    624 	case FIONCLEX:
    625 		rw_enter(&fdp->fd_lock, RW_WRITER);
    626 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
    627 		rw_exit(&fdp->fd_lock);
    628 		goto out;
    629 
    630 	case FIOCLEX:
    631 		rw_enter(&fdp->fd_lock, RW_WRITER);
    632 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
    633 		rw_exit(&fdp->fd_lock);
    634 		goto out;
    635 	}
    636 
    637 	/*
    638 	 * Interpret high order word to find amount of data to be
    639 	 * copied to/from the user's address space.
    640 	 */
    641 	size = IOCPARM_LEN(com);
    642 	if (size > IOCPARM_MAX) {
    643 		error = ENOTTY;
    644 		goto out;
    645 	}
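	/*
	 * Stage the argument: commands carrying at most STK_PARAMS bytes
	 * of data use the on-stack buffer, larger ones (up to
	 * IOCPARM_MAX) a temporary allocation freed once the command
	 * completes.
	 */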
    646 	memp = NULL;
    647 	if (size > sizeof(stkbuf)) {
    648 		memp = kmem_alloc(size, KM_SLEEP);
    649 		data = memp;
    650 	} else
    651 		data = (void *)stkbuf;
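	/*
	 * The direction bits in the command say how to stage the data:
	 * IOC_IN arguments are copied in from user space (a zero-size
	 * IOC_IN command passes the user pointer itself), IOC_OUT
	 * buffers are zeroed here and copied back out after the call,
	 * and IOC_VOID commands receive the raw argument value.
	 */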
    652 	if (com&IOC_IN) {
    653 		if (size) {
    654 			error = copyin(SCARG(uap, data), data, size);
    655 			if (error) {
    656 				if (memp)
    657 					kmem_free(memp, size);
    658 				goto out;
    659 			}
    660 #ifdef KTRACE
    661 			if (KTRPOINT(p, KTR_GENIO)) {
    662 				struct iovec iov;
    663 				iov.iov_base = SCARG(uap, data);
    664 				iov.iov_len = size;
    665 				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
    666 					size, 0);
    667 			}
    668 #endif
    669 		} else
    670 			*(void **)data = SCARG(uap, data);
    671 	} else if ((com&IOC_OUT) && size)
    672 		/*
    673 		 * Zero the buffer so the user always
    674 		 * gets back something deterministic.
    675 		 */
    676 		memset(data, 0, size);
    677 	else if (com&IOC_VOID)
    678 		*(void **)data = SCARG(uap, data);
    679 
    680 	switch (com) {
    681 
    682 	case FIONBIO:
    683 		mutex_enter(&fp->f_lock);
    684 		if (*(int *)data != 0)
    685 			fp->f_flag |= FNONBLOCK;
    686 		else
    687 			fp->f_flag &= ~FNONBLOCK;
    688 		mutex_exit(&fp->f_lock);
    689 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
    690 		break;
    691 
    692 	case FIOASYNC:
    693 		mutex_enter(&fp->f_lock);
    694 		if (*(int *)data != 0)
    695 			fp->f_flag |= FASYNC;
    696 		else
    697 			fp->f_flag &= ~FASYNC;
    698 		mutex_exit(&fp->f_lock);
    699 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
    700 		break;
    701 
    702 	default:
    703 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
    704 		/*
    705 		 * Copy any data to user, size was
    706 		 * already set and checked above.
    707 		 */
    708 		if (error == 0 && (com&IOC_OUT) && size) {
    709 			error = copyout(data, SCARG(uap, data), size);
    710 #ifdef KTRACE
    711 			if (KTRPOINT(p, KTR_GENIO)) {
    712 				struct iovec iov;
    713 				iov.iov_base = SCARG(uap, data);
    714 				iov.iov_len = size;
    715 				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
    716 					size, error);
    717 			}
    718 #endif
    719 		}
    720 		break;
    721 	}
    722 	if (memp)
    723 		kmem_free(memp, size);
    724  out:
    725 	FILE_UNUSE(fp, l);
    726 	switch (error) {
    727 	case -1:
    728 		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
    729 		    "pid=%d comm=%s\n",
    730 		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
    731 		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
    732 		    p->p_pid, p->p_comm);
    733 		/* FALLTHROUGH */
    734 	case EPASSTHROUGH:
    735 		error = ENOTTY;
    736 		/* FALLTHROUGH */
    737 	default:
    738 		return (error);
    739 	}
    740 }
    741 
    742 /*
     743  * Select system call.  pselect(2) also takes a signal mask.
    744  */
    745 int
    746 sys_pselect(lwp_t *l, void *v, register_t *retval)
    747 {
    748 	struct sys_pselect_args /* {
    749 		syscallarg(int)				nd;
    750 		syscallarg(fd_set *)			in;
    751 		syscallarg(fd_set *)			ou;
    752 		syscallarg(fd_set *)			ex;
    753 		syscallarg(const struct timespec *)	ts;
    754 		syscallarg(sigset_t *)			mask;
    755 	} */ * const uap = v;
    756 	struct timespec	ats;
    757 	struct timeval	atv, *tv = NULL;
    758 	sigset_t	amask, *mask = NULL;
    759 	int		error;
    760 
    761 	if (SCARG(uap, ts)) {
    762 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    763 		if (error)
    764 			return error;
    765 		atv.tv_sec = ats.tv_sec;
    766 		atv.tv_usec = ats.tv_nsec / 1000;
    767 		tv = &atv;
    768 	}
    769 	if (SCARG(uap, mask) != NULL) {
    770 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    771 		if (error)
    772 			return error;
    773 		mask = &amask;
    774 	}
    775 
    776 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    777 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    778 }
    779 
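/*
 * Validate the caller's timeout and record the current monotonic time;
 * gettimeleft() later subtracts the time actually slept to yield the
 * number of ticks still remaining.
 */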
    780 int
    781 inittimeleft(struct timeval *tv, struct timeval *sleeptv)
    782 {
    783 	if (itimerfix(tv))
    784 		return -1;
    785 	getmicrouptime(sleeptv);
    786 	return 0;
    787 }
    788 
    789 int
    790 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
    791 {
    792 	/*
    793 	 * We have to recalculate the timeout on every retry.
    794 	 */
    795 	struct timeval slepttv;
    796 	/*
    797 	 * reduce tv by elapsed time
    798 	 * based on monotonic time scale
    799 	 */
    800 	getmicrouptime(&slepttv);
    801 	timeradd(tv, sleeptv, tv);
    802 	timersub(tv, &slepttv, tv);
    803 	*sleeptv = slepttv;
    804 	return tvtohz(tv);
    805 }
    806 
    807 int
    808 sys_select(lwp_t *l, void *v, register_t *retval)
    809 {
    810 	struct sys_select_args /* {
    811 		syscallarg(int)			nd;
    812 		syscallarg(fd_set *)		in;
    813 		syscallarg(fd_set *)		ou;
    814 		syscallarg(fd_set *)		ex;
    815 		syscallarg(struct timeval *)	tv;
    816 	} */ * const uap = v;
    817 	struct timeval atv, *tv = NULL;
    818 	int error;
    819 
    820 	if (SCARG(uap, tv)) {
    821 		error = copyin(SCARG(uap, tv), (void *)&atv,
    822 			sizeof(atv));
    823 		if (error)
    824 			return error;
    825 		tv = &atv;
    826 	}
    827 
    828 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    829 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    830 }
    831 
    832 int
    833 selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    834 	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
    835 {
    836 	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
    837 			    sizeof(fd_mask) * 6];
    838 	proc_t		* const p = l->l_proc;
    839 	char 		*bits;
    840 	int		ncoll, error, timo;
    841 	size_t		ni;
    842 	sigset_t	oldmask;
    843 	struct timeval  sleeptv;
    844 
    845 	error = 0;
    846 	if (nd < 0)
    847 		return (EINVAL);
    848 	if (nd > p->p_fd->fd_nfiles) {
    849 		/* forgiving; slightly wrong */
    850 		nd = p->p_fd->fd_nfiles;
    851 	}
    852 	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
    853 	if (ni * 6 > sizeof(smallbits))
    854 		bits = kmem_alloc(ni * 6, KM_SLEEP);
    855 	else
    856 		bits = smallbits;
    857 
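	/*
	 * The bits buffer holds six fd_mask arrays of ni bytes each:
	 * slots 0-2 carry the caller's in/ou/ex sets and slots 3-5
	 * collect the matching result sets filled in by selscan().
	 */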
    858 #define	getbits(name, x)						\
    859 	if (u_ ## name) {						\
    860 		error = copyin(u_ ## name, bits + ni * x, ni);		\
    861 		if (error)						\
    862 			goto done;					\
    863 	} else								\
    864 		memset(bits + ni * x, 0, ni);
    865 	getbits(in, 0);
    866 	getbits(ou, 1);
    867 	getbits(ex, 2);
    868 #undef	getbits
    869 
    870 	timo = 0;
    871 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
    872 		error = EINVAL;
    873 		goto done;
    874 	}
    875 
    876 	if (mask) {
    877 		sigminusset(&sigcantmask, mask);
    878 		mutex_enter(&p->p_smutex);
    879 		oldmask = l->l_sigmask;
    880 		l->l_sigmask = *mask;
    881 		mutex_exit(&p->p_smutex);
    882 	} else
    883 		oldmask = l->l_sigmask;	/* XXXgcc */
    884 
    885 	mutex_enter(&select_lock);
    886 	SLIST_INIT(&l->l_selwait);
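	/*
	 * Scan with select_lock dropped.  If nothing is ready and no
	 * wakeup or collision raced with the scan (checked via l_selflag
	 * and nselcoll), block on select_cv until selwakeup() or the
	 * timeout; otherwise go around and scan again.
	 */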
    887 	for (;;) {
    888 	 	l->l_selflag = SEL_SCANNING;
    889 		ncoll = nselcoll;
    890  		mutex_exit(&select_lock);
    891 
    892 		error = selscan(l, (fd_mask *)(bits + ni * 0),
    893 		    (fd_mask *)(bits + ni * 3), nd, retval);
    894 
    895 		mutex_enter(&select_lock);
    896 		if (error || *retval)
    897 			break;
    898 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
    899 			break;
    900 		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
    901 			continue;
    902 		l->l_selflag = SEL_BLOCKING;
    903 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
    904 		if (error != 0)
    905 			break;
    906 	}
    907 	selclear();
    908 	mutex_exit(&select_lock);
    909 
    910 	if (mask) {
    911 		mutex_enter(&p->p_smutex);
    912 		l->l_sigmask = oldmask;
    913 		mutex_exit(&p->p_smutex);
    914 	}
    915 
    916  done:
    917 	/* select is not restarted after signals... */
    918 	if (error == ERESTART)
    919 		error = EINTR;
    920 	if (error == EWOULDBLOCK)
    921 		error = 0;
    922 	if (error == 0 && u_in != NULL)
    923 		error = copyout(bits + ni * 3, u_in, ni);
    924 	if (error == 0 && u_ou != NULL)
    925 		error = copyout(bits + ni * 4, u_ou, ni);
    926 	if (error == 0 && u_ex != NULL)
    927 		error = copyout(bits + ni * 5, u_ex, ni);
    928 	if (bits != smallbits)
    929 		kmem_free(bits, ni * 6);
    930 	return (error);
    931 }
    932 
    933 int
    934 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    935 	register_t *retval)
    936 {
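	/*
	 * Map the three fd_set arguments onto the poll events that mark
	 * a descriptor ready: normal read, normal write and out-of-band
	 * data respectively.  POLLHUP and POLLERR count as ready for the
	 * read and write sets.
	 */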
    937 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    938 			       POLLWRNORM | POLLHUP | POLLERR,
    939 			       POLLRDBAND };
    940 	proc_t *p = l->l_proc;
    941 	struct filedesc	*fdp;
    942 	int msk, i, j, fd, n;
    943 	fd_mask ibits, obits;
    944 	struct file *fp;
    945 
    946 	fdp = p->p_fd;
    947 	n = 0;
    948 	for (msk = 0; msk < 3; msk++) {
    949 		for (i = 0; i < nfd; i += NFDBITS) {
    950 			ibits = *ibitp++;
    951 			obits = 0;
    952 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    953 				ibits &= ~(1 << j);
    954 				if ((fp = fd_getfile(fdp, fd)) == NULL)
    955 					return (EBADF);
    956 				FILE_USE(fp);
    957 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
    958 					obits |= (1 << j);
    959 					n++;
    960 				}
    961 				FILE_UNUSE(fp, l);
    962 			}
    963 			*obitp++ = obits;
    964 		}
    965 	}
    966 	*retval = n;
    967 	return (0);
    968 }
    969 
    970 /*
    971  * Poll system call.
    972  */
    973 int
    974 sys_poll(lwp_t *l, void *v, register_t *retval)
    975 {
    976 	struct sys_poll_args /* {
    977 		syscallarg(struct pollfd *)	fds;
    978 		syscallarg(u_int)		nfds;
    979 		syscallarg(int)			timeout;
    980 	} */ * const uap = v;
    981 	struct timeval	atv, *tv = NULL;
    982 
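	/*
	 * A timeout of INFTIM (-1) leaves tv NULL, making pollcommon()
	 * block until a descriptor is ready or a signal arrives; any
	 * other value is converted from milliseconds to a timeval.
	 */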
    983 	if (SCARG(uap, timeout) != INFTIM) {
    984 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    985 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    986 		tv = &atv;
    987 	}
    988 
    989 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    990 		tv, NULL);
    991 }
    992 
    993 /*
     994  * Poll system call with a signal mask and timespec timeout (pollts).
    995  */
    996 int
    997 sys_pollts(lwp_t *l, void *v, register_t *retval)
    998 {
    999 	struct sys_pollts_args /* {
   1000 		syscallarg(struct pollfd *)		fds;
   1001 		syscallarg(u_int)			nfds;
   1002 		syscallarg(const struct timespec *)	ts;
   1003 		syscallarg(const sigset_t *)		mask;
   1004 	} */ * const uap = v;
   1005 	struct timespec	ats;
   1006 	struct timeval	atv, *tv = NULL;
   1007 	sigset_t	amask, *mask = NULL;
   1008 	int		error;
   1009 
   1010 	if (SCARG(uap, ts)) {
   1011 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
   1012 		if (error)
   1013 			return error;
   1014 		atv.tv_sec = ats.tv_sec;
   1015 		atv.tv_usec = ats.tv_nsec / 1000;
   1016 		tv = &atv;
   1017 	}
   1018 	if (SCARG(uap, mask)) {
   1019 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
   1020 		if (error)
   1021 			return error;
   1022 		mask = &amask;
   1023 	}
   1024 
   1025 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
   1026 		tv, mask);
   1027 }
   1028 
   1029 int
   1030 pollcommon(lwp_t *l, register_t *retval,
   1031 	struct pollfd *u_fds, u_int nfds,
   1032 	struct timeval *tv, sigset_t *mask)
   1033 {
   1034 	char		smallbits[32 * sizeof(struct pollfd)];
   1035 	proc_t		* const p = l->l_proc;
   1036 	void *		bits;
   1037 	sigset_t	oldmask;
   1038 	int		ncoll, error, timo;
   1039 	size_t		ni;
   1040 	struct timeval	sleeptv;
   1041 
   1042 	if (nfds > p->p_fd->fd_nfiles) {
   1043 		/* forgiving; slightly wrong */
   1044 		nfds = p->p_fd->fd_nfiles;
   1045 	}
   1046 	ni = nfds * sizeof(struct pollfd);
   1047 	if (ni > sizeof(smallbits))
   1048 		bits = kmem_alloc(ni, KM_SLEEP);
   1049 	else
   1050 		bits = smallbits;
   1051 
   1052 	error = copyin(u_fds, bits, ni);
   1053 	if (error)
   1054 		goto done;
   1055 
   1056 	timo = 0;
   1057 	if (tv && inittimeleft(tv, &sleeptv) == -1) {
   1058 		error = EINVAL;
   1059 		goto done;
   1060 	}
   1061 
   1062 	if (mask) {
   1063 		sigminusset(&sigcantmask, mask);
   1064 		mutex_enter(&p->p_smutex);
   1065 		oldmask = l->l_sigmask;
   1066 		l->l_sigmask = *mask;
   1067 		mutex_exit(&p->p_smutex);
   1068 	} else
   1069 		oldmask = l->l_sigmask;	/* XXXgcc */
   1070 
   1071 	mutex_enter(&select_lock);
   1072 	SLIST_INIT(&l->l_selwait);
   1073 	for (;;) {
   1074 		ncoll = nselcoll;
   1075 		l->l_selflag = SEL_SCANNING;
   1076 		mutex_exit(&select_lock);
   1077 
   1078 		error = pollscan(l, (struct pollfd *)bits, nfds, retval);
   1079 
   1080 		mutex_enter(&select_lock);
   1081 		if (error || *retval)
   1082 			break;
   1083 		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
   1084 			break;
   1085 		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
   1086 			continue;
   1087 		l->l_selflag = SEL_BLOCKING;
   1088 		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
   1089 		if (error != 0)
   1090 			break;
   1091 	}
   1092 	selclear();
   1093 	mutex_exit(&select_lock);
   1094 
   1095 	if (mask) {
   1096 		mutex_enter(&p->p_smutex);
   1097 		l->l_sigmask = oldmask;
   1098 		mutex_exit(&p->p_smutex);
   1099 	}
   1100  done:
   1101 	/* poll is not restarted after signals... */
   1102 	if (error == ERESTART)
   1103 		error = EINTR;
   1104 	if (error == EWOULDBLOCK)
   1105 		error = 0;
   1106 	if (error == 0)
   1107 		error = copyout(bits, u_fds, ni);
   1108 	if (bits != smallbits)
   1109 		kmem_free(bits, ni);
   1110 	return (error);
   1111 }
   1112 
   1113 int
   1114 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1115 {
   1116 	proc_t		*p = l->l_proc;
   1117 	struct filedesc	*fdp;
   1118 	int		i, n;
   1119 	struct file	*fp;
   1120 
   1121 	fdp = p->p_fd;
   1122 	n = 0;
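	/*
	 * A descriptor number beyond the table, or one that is no longer
	 * open, reports POLLNVAL; a negative fd is simply skipped with
	 * revents cleared, as poll(2) requires.  Otherwise the file's
	 * poll routine is asked about the requested events, with POLLERR
	 * and POLLHUP always included.
	 */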
   1123 	for (i = 0; i < nfd; i++, fds++) {
   1124 		if (fds->fd >= fdp->fd_nfiles) {
   1125 			fds->revents = POLLNVAL;
   1126 			n++;
   1127 		} else if (fds->fd < 0) {
   1128 			fds->revents = 0;
   1129 		} else {
   1130 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
   1131 				fds->revents = POLLNVAL;
   1132 				n++;
   1133 			} else {
   1134 				FILE_USE(fp);
   1135 				fds->revents = (*fp->f_ops->fo_poll)(fp,
   1136 				    fds->events | POLLERR | POLLHUP, l);
   1137 				if (fds->revents != 0)
   1138 					n++;
   1139 				FILE_UNUSE(fp, l);
   1140 			}
   1141 		}
   1142 	}
   1143 	*retval = n;
   1144 	return (0);
   1145 }
   1146 
   1147 /*ARGSUSED*/
   1148 int
   1149 seltrue(dev_t dev, int events, lwp_t *l)
   1150 {
   1151 
   1152 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1153 }
   1154 
   1155 /*
   1156  * Record a select request.
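 *
 * A selinfo can name at most one waiting LWP.  If a second LWP records
 * itself against the same selinfo, sel_collision is set and selwakeup()
 * falls back to broadcasting to every LWP sleeping in select/poll.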
   1157  */
   1158 void
   1159 selrecord(lwp_t *selector, struct selinfo *sip)
   1160 {
   1161 
   1162 	mutex_enter(&select_lock);
   1163 	if (sip->sel_lwp == NULL) {
   1164 		/* First named waiter, although there may be more. */
   1165 		sip->sel_lwp = selector;
   1166 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1167 	} else if (sip->sel_lwp != selector) {
   1168 		/* Multiple waiters. */
   1169 		sip->sel_collision = true;
   1170 	}
   1171 	mutex_exit(&select_lock);
   1172 }
   1173 
   1174 /*
   1175  * Do a wakeup when a selectable event occurs.
   1176  */
   1177 void
   1178 selwakeup(struct selinfo *sip)
   1179 {
   1180 	lwp_t *l;
   1181 
   1182 	mutex_enter(&select_lock);
   1183 	if (sip->sel_collision) {
   1184 		/* Multiple waiters - just notify everybody. */
   1185 		nselcoll++;
   1186 		sip->sel_collision = false;
   1187 		cv_broadcast(&select_cv);
   1188 	} else if (sip->sel_lwp != NULL) {
   1189 		/* Only one LWP waiting. */
   1190 		l = sip->sel_lwp;
   1191 		if (l->l_selflag == SEL_BLOCKING) {
   1192 			/*
   1193 			 * If it's sleeping, wake it up.  If not, it's
   1194 			 * already awake but hasn't yet removed itself
   1195 			 * from the selector.  We reset the state below
   1196 			 * so that we only attempt to do this once.
   1197 			 */
   1198 			lwp_lock(l);
   1199 			if (l->l_wchan == &select_cv) {
   1200 				/* lwp_unsleep() releases the LWP lock. */
   1201 				lwp_unsleep(l);
   1202 			} else
   1203 				lwp_unlock(l);
   1204 		} else {
   1205 			/*
   1206 			 * Not yet asleep.  Reset its state below so that
   1207 			 * it will go around again.
   1208 			 */
   1209 		}
   1210 		l->l_selflag = SEL_RESET;
   1211 	}
   1212 	mutex_exit(&select_lock);
   1213 }
   1214 
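/*
 * Notify both select/poll waiters and any registered knotes that a
 * selectable event has occurred.
 */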
   1215 void
   1216 selnotify(struct selinfo *sip, long knhint)
   1217 {
   1218 
   1219 	selwakeup(sip);
   1220 	KNOTE(&sip->sel_klist, knhint);
   1221 }
   1222 
   1223 /*
   1224  * Remove an LWP from all objects that it is waiting for.
   1225  */
   1226 static void
   1227 selclear(void)
   1228 {
   1229 	struct selinfo *sip;
   1230 	lwp_t *l = curlwp;
   1231 
   1232 	KASSERT(mutex_owned(&select_lock));
   1233 
   1234 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1235 		KASSERT(sip->sel_lwp == l);
   1236 		sip->sel_lwp = NULL;
   1237 	}
   1238 }
   1239 
   1240 /*
   1241  * Initialize the select/poll system calls.
   1242  */
   1243 void
   1244 selsysinit(void)
   1245 {
   1246 
   1247 	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
   1248 	cv_init(&select_cv, "select");
   1249 }
   1250