Home | History | Annotate | Line # | Download | only in kern
sys_generic.c revision 1.100.2.7
      1 /*	$NetBSD: sys_generic.c,v 1.100.2.7 2007/06/09 23:58:06 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the NetBSD
     21  *	Foundation, Inc. and its contributors.
     22  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23  *    contributors may be used to endorse or promote products derived
     24  *    from this software without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36  * POSSIBILITY OF SUCH DAMAGE.
     37  */
     38 
     39 /*
     40  * Copyright (c) 1982, 1986, 1989, 1993
     41  *	The Regents of the University of California.  All rights reserved.
     42  * (c) UNIX System Laboratories, Inc.
     43  * All or some portions of this file are derived from material licensed
     44  * to the University of California by American Telephone and Telegraph
     45  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     46  * the permission of UNIX System Laboratories, Inc.
     47  *
     48  * Redistribution and use in source and binary forms, with or without
     49  * modification, are permitted provided that the following conditions
     50  * are met:
     51  * 1. Redistributions of source code must retain the above copyright
     52  *    notice, this list of conditions and the following disclaimer.
     53  * 2. Redistributions in binary form must reproduce the above copyright
     54  *    notice, this list of conditions and the following disclaimer in the
     55  *    documentation and/or other materials provided with the distribution.
     56  * 3. Neither the name of the University nor the names of its contributors
     57  *    may be used to endorse or promote products derived from this software
     58  *    without specific prior written permission.
     59  *
     60  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     61  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     62  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     63  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     64  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     65  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     66  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     67  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     68  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     69  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     70  * SUCH DAMAGE.
     71  *
     72  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     73  */
     74 
     75 /*
     76  * System calls relating to files.
     77  */
     78 
     79 #include <sys/cdefs.h>
     80 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.100.2.7 2007/06/09 23:58:06 ad Exp $");
     81 
     82 #include "opt_ktrace.h"
     83 
     84 #include <sys/param.h>
     85 #include <sys/systm.h>
     86 #include <sys/filedesc.h>
     87 #include <sys/ioctl.h>
     88 #include <sys/file.h>
     89 #include <sys/proc.h>
     90 #include <sys/socketvar.h>
     91 #include <sys/signalvar.h>
     92 #include <sys/uio.h>
     93 #include <sys/kernel.h>
     94 #include <sys/stat.h>
     95 #include <sys/kmem.h>
     96 #include <sys/poll.h>
     97 #include <sys/mount.h>
     98 #include <sys/syscallargs.h>
     99 #ifdef KTRACE
    100 #include <sys/ktrace.h>
    101 #endif
    102 
#include <uvm/uvm_extern.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */

/* Descriptor scanners shared by select()/pselect() and poll()/pollts(). */
static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

/* Global state for select()/poll(). */
kmutex_t	select_lock;	/* serializes l_selflag/l_selwait and nselcoll */
kcondvar_t	select_cv;	/* pollers block here waiting for activity */
int		nselcoll;	/* collision generation counter; bumped on wakeup */
    118 
    119 /*
    120  * Read system call.
    121  */
    122 /* ARGSUSED */
    123 int
    124 sys_read(lwp_t *l, void *v, register_t *retval)
    125 {
    126 	struct sys_read_args /* {
    127 		syscallarg(int)		fd;
    128 		syscallarg(void *)	buf;
    129 		syscallarg(size_t)	nbyte;
    130 	} */ *uap = v;
    131 	int		fd;
    132 	struct file	*fp;
    133 	proc_t		*p;
    134 	struct filedesc	*fdp;
    135 
    136 	fd = SCARG(uap, fd);
    137 	p = l->l_proc;
    138 	fdp = p->p_fd;
    139 
    140 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    141 		return (EBADF);
    142 
    143 	if ((fp->f_flag & FREAD) == 0) {
    144 		mutex_exit(&fp->f_lock);
    145 		return (EBADF);
    146 	}
    147 
    148 	FILE_USE(fp);
    149 
    150 	/* dofileread() will unuse the descriptor for us */
    151 	return (dofileread(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    152 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    153 }
    154 
    155 int
    156 dofileread(lwp_t *l, int fd, struct file *fp, void *buf, size_t nbyte,
    157 	off_t *offset, int flags, register_t *retval)
    158 {
    159 	struct iovec aiov;
    160 	struct uio auio;
    161 	proc_t *p;
    162 	struct vmspace *vm;
    163 	size_t cnt;
    164 	int error;
    165 #ifdef KTRACE
    166 	struct iovec	ktriov;
    167 #endif
    168 	p = l->l_proc;
    169 
    170 	error = proc_vmspace_getref(p, &vm);
    171 	if (error) {
    172 		goto out;
    173 	}
    174 
    175 	aiov.iov_base = (void *)buf;
    176 	aiov.iov_len = nbyte;
    177 	auio.uio_iov = &aiov;
    178 	auio.uio_iovcnt = 1;
    179 	auio.uio_resid = nbyte;
    180 	auio.uio_rw = UIO_READ;
    181 	auio.uio_vmspace = vm;
    182 
    183 	/*
    184 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    185 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    186 	 * values.
    187 	 */
    188 	if (auio.uio_resid > SSIZE_MAX) {
    189 		error = EINVAL;
    190 		goto out;
    191 	}
    192 
    193 #ifdef KTRACE
    194 	/* In case we are tracing, save a copy of iovec */
    195 	ktriov = aiov;
    196 #endif
    197 	cnt = auio.uio_resid;
    198 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    199 	if (error)
    200 		if (auio.uio_resid != cnt && (error == ERESTART ||
    201 		    error == EINTR || error == EWOULDBLOCK))
    202 			error = 0;
    203 	cnt -= auio.uio_resid;
    204 #ifdef KTRACE
    205 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
    206 		ktrgenio(l, fd, UIO_READ, &ktriov, cnt, error);
    207 #endif
    208 	*retval = cnt;
    209  out:
    210 	FILE_UNUSE(fp, l);
    211 	uvmspace_free(vm);
    212 	return (error);
    213 }
    214 
    215 /*
    216  * Scatter read system call.
    217  */
    218 int
    219 sys_readv(lwp_t *l, void *v, register_t *retval)
    220 {
    221 	struct sys_readv_args /* {
    222 		syscallarg(int)				fd;
    223 		syscallarg(const struct iovec *)	iovp;
    224 		syscallarg(int)				iovcnt;
    225 	} */ *uap = v;
    226 	struct filedesc	*fdp;
    227 	struct file *fp;
    228 	proc_t *p;
    229 	int fd;
    230 
    231 	fd = SCARG(uap, fd);
    232 	p = l->l_proc;
    233 	fdp = p->p_fd;
    234 
    235 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    236 		return (EBADF);
    237 
    238 	if ((fp->f_flag & FREAD) == 0) {
    239 		mutex_exit(&fp->f_lock);
    240 		return (EBADF);
    241 	}
    242 
    243 	FILE_USE(fp);
    244 
    245 	/* dofilereadv() will unuse the descriptor for us */
    246 	return (dofilereadv(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
    247 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    248 }
    249 
    250 int
    251 dofilereadv(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
    252 	int iovcnt, off_t *offset, int flags, register_t *retval)
    253 {
    254 	proc_t		*p;
    255 	struct uio	auio;
    256 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
    257 	struct vmspace	*vm;
    258 	int		i, error;
    259 	size_t		cnt;
    260 	u_int		iovlen;
    261 #ifdef KTRACE
    262 	struct iovec	*ktriov;
    263 #endif
    264 
    265 	p = l->l_proc;
    266 	error = proc_vmspace_getref(p, &vm);
    267 	if (error) {
    268 		goto out;
    269 	}
    270 
    271 #ifdef KTRACE
    272 	ktriov = NULL;
    273 #endif
    274 	/* note: can't use iovlen until iovcnt is validated */
    275 	iovlen = iovcnt * sizeof(struct iovec);
    276 	if ((u_int)iovcnt > UIO_SMALLIOV) {
    277 		if ((u_int)iovcnt > IOV_MAX) {
    278 			error = EINVAL;
    279 			goto out;
    280 		}
    281 		iov = kmem_alloc(iovlen, KM_SLEEP);
    282 		needfree = iov;
    283 	} else if ((u_int)iovcnt > 0) {
    284 		iov = aiov;
    285 		needfree = NULL;
    286 	} else {
    287 		error = EINVAL;
    288 		goto out;
    289 	}
    290 
    291 	auio.uio_iov = iov;
    292 	auio.uio_iovcnt = iovcnt;
    293 	auio.uio_rw = UIO_READ;
    294 	auio.uio_vmspace = vm;
    295 	error = copyin(iovp, iov, iovlen);
    296 	if (error)
    297 		goto done;
    298 	auio.uio_resid = 0;
    299 	for (i = 0; i < iovcnt; i++) {
    300 		auio.uio_resid += iov->iov_len;
    301 		/*
    302 		 * Reads return ssize_t because -1 is returned on error.
    303 		 * Therefore we must restrict the length to SSIZE_MAX to
    304 		 * avoid garbage return values.
    305 		 */
    306 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    307 			error = EINVAL;
    308 			goto done;
    309 		}
    310 		iov++;
    311 	}
    312 #ifdef KTRACE
    313 	/*
    314 	 * if tracing, save a copy of iovec
    315 	 */
    316 	if (KTRPOINT(p, KTR_GENIO))  {
    317 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    318 		memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
    319 	}
    320 #endif
    321 	cnt = auio.uio_resid;
    322 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    323 	if (error)
    324 		if (auio.uio_resid != cnt && (error == ERESTART ||
    325 		    error == EINTR || error == EWOULDBLOCK))
    326 			error = 0;
    327 	cnt -= auio.uio_resid;
    328 #ifdef KTRACE
    329 	if (ktriov != NULL) {
    330 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
    331 			ktrgenio(l, fd, UIO_READ, ktriov, cnt, error);
    332 		kmem_free(ktriov, iovlen);
    333 	}
    334 #endif
    335 	*retval = cnt;
    336  done:
    337 	if (needfree)
    338 		kmem_free(needfree, iovlen);
    339  out:
    340 	FILE_UNUSE(fp, l);
    341 	uvmspace_free(vm);
    342 	return (error);
    343 }
    344 
    345 /*
    346  * Write system call
    347  */
    348 int
    349 sys_write(lwp_t *l, void *v, register_t *retval)
    350 {
    351 	struct sys_write_args /* {
    352 		syscallarg(int)			fd;
    353 		syscallarg(const void *)	buf;
    354 		syscallarg(size_t)		nbyte;
    355 	} */ *uap = v;
    356 	int		fd;
    357 	struct file	*fp;
    358 	proc_t		*p;
    359 	struct filedesc	*fdp;
    360 
    361 	fd = SCARG(uap, fd);
    362 	p = l->l_proc;
    363 	fdp = p->p_fd;
    364 
    365 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    366 		return (EBADF);
    367 
    368 	if ((fp->f_flag & FWRITE) == 0) {
    369 		mutex_exit(&fp->f_lock);
    370 		return (EBADF);
    371 	}
    372 
    373 	FILE_USE(fp);
    374 
    375 	/* dofilewrite() will unuse the descriptor for us */
    376 	return (dofilewrite(l, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    377 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    378 }
    379 
    380 int
    381 dofilewrite(lwp_t *l, int fd, struct file *fp, const void *buf,
    382 	size_t nbyte, off_t *offset, int flags, register_t *retval)
    383 {
    384 	struct iovec aiov;
    385 	struct uio auio;
    386 	proc_t *p;
    387 	struct vmspace *vm;
    388 	size_t cnt;
    389 	int error;
    390 #ifdef KTRACE
    391 	struct iovec	ktriov;
    392 #endif
    393 
    394 	p = l->l_proc;
    395 	error = proc_vmspace_getref(p, &vm);
    396 	if (error) {
    397 		goto out;
    398 	}
    399 	aiov.iov_base = __UNCONST(buf);		/* XXXUNCONST kills const */
    400 	aiov.iov_len = nbyte;
    401 	auio.uio_iov = &aiov;
    402 	auio.uio_iovcnt = 1;
    403 	auio.uio_resid = nbyte;
    404 	auio.uio_rw = UIO_WRITE;
    405 	auio.uio_vmspace = vm;
    406 
    407 	/*
    408 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    409 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    410 	 * values.
    411 	 */
    412 	if (auio.uio_resid > SSIZE_MAX) {
    413 		error = EINVAL;
    414 		goto out;
    415 	}
    416 
    417 #ifdef KTRACE
    418 	/* In case we are tracing, save a copy of iovec */
    419 	ktriov = aiov;
    420 #endif
    421 	cnt = auio.uio_resid;
    422 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    423 	if (error) {
    424 		if (auio.uio_resid != cnt && (error == ERESTART ||
    425 		    error == EINTR || error == EWOULDBLOCK))
    426 			error = 0;
    427 		if (error == EPIPE) {
    428 			mutex_enter(&proclist_mutex);
    429 			psignal(p, SIGPIPE);
    430 			mutex_exit(&proclist_mutex);
    431 		}
    432 	}
    433 	cnt -= auio.uio_resid;
    434 #ifdef KTRACE
    435 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
    436 		ktrgenio(l, fd, UIO_WRITE, &ktriov, cnt, error);
    437 #endif
    438 	*retval = cnt;
    439  out:
    440 	FILE_UNUSE(fp, l);
    441 	uvmspace_free(vm);
    442 	return (error);
    443 }
    444 
    445 /*
    446  * Gather write system call
    447  */
    448 int
    449 sys_writev(lwp_t *l, void *v, register_t *retval)
    450 {
    451 	struct sys_writev_args /* {
    452 		syscallarg(int)				fd;
    453 		syscallarg(const struct iovec *)	iovp;
    454 		syscallarg(int)				iovcnt;
    455 	} */ *uap = v;
    456 	int		fd;
    457 	struct file	*fp;
    458 	proc_t		*p;
    459 	struct filedesc	*fdp;
    460 
    461 	fd = SCARG(uap, fd);
    462 	p = l->l_proc;
    463 	fdp = p->p_fd;
    464 
    465 	if ((fp = fd_getfile(fdp, fd)) == NULL)
    466 		return (EBADF);
    467 
    468 	if ((fp->f_flag & FWRITE) == 0) {
    469 		mutex_exit(&fp->f_lock);
    470 		return (EBADF);
    471 	}
    472 
    473 	FILE_USE(fp);
    474 
    475 	/* dofilewritev() will unuse the descriptor for us */
    476 	return (dofilewritev(l, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
    477 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    478 }
    479 
    480 int
    481 dofilewritev(lwp_t *l, int fd, struct file *fp, const struct iovec *iovp,
    482 	int iovcnt, off_t *offset, int flags, register_t *retval)
    483 {
    484 	proc_t		*p;
    485 	struct uio	auio;
    486 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
    487 	struct vmspace	*vm;
    488 	int		i, error;
    489 	size_t		cnt;
    490 	u_int		iovlen;
    491 #ifdef KTRACE
    492 	struct iovec	*ktriov;
    493 #endif
    494 
    495 	p = l->l_proc;
    496 	error = proc_vmspace_getref(p, &vm);
    497 	if (error) {
    498 		goto out;
    499 	}
    500 #ifdef KTRACE
    501 	ktriov = NULL;
    502 #endif
    503 	/* note: can't use iovlen until iovcnt is validated */
    504 	iovlen = iovcnt * sizeof(struct iovec);
    505 	if ((u_int)iovcnt > UIO_SMALLIOV) {
    506 		if ((u_int)iovcnt > IOV_MAX) {
    507 			error = EINVAL;
    508 			goto out;
    509 		}
    510 		iov = kmem_alloc(iovlen, KM_SLEEP);
    511 		needfree = iov;
    512 	} else if ((u_int)iovcnt > 0) {
    513 		iov = aiov;
    514 		needfree = NULL;
    515 	} else {
    516 		error = EINVAL;
    517 		goto out;
    518 	}
    519 
    520 	auio.uio_iov = iov;
    521 	auio.uio_iovcnt = iovcnt;
    522 	auio.uio_rw = UIO_WRITE;
    523 	auio.uio_vmspace = vm;
    524 	error = copyin(iovp, iov, iovlen);
    525 	if (error)
    526 		goto done;
    527 	auio.uio_resid = 0;
    528 	for (i = 0; i < iovcnt; i++) {
    529 		auio.uio_resid += iov->iov_len;
    530 		/*
    531 		 * Writes return ssize_t because -1 is returned on error.
    532 		 * Therefore we must restrict the length to SSIZE_MAX to
    533 		 * avoid garbage return values.
    534 		 */
    535 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    536 			error = EINVAL;
    537 			goto done;
    538 		}
    539 		iov++;
    540 	}
    541 #ifdef KTRACE
    542 	/*
    543 	 * if tracing, save a copy of iovec
    544 	 */
    545 	if (KTRPOINT(p, KTR_GENIO))  {
    546 		ktriov = kmem_alloc(iovlen, KM_SLEEP);
    547 		memcpy((void *)ktriov, (void *)auio.uio_iov, iovlen);
    548 	}
    549 #endif
    550 	cnt = auio.uio_resid;
    551 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    552 	if (error) {
    553 		if (auio.uio_resid != cnt && (error == ERESTART ||
    554 		    error == EINTR || error == EWOULDBLOCK))
    555 			error = 0;
    556 		if (error == EPIPE) {
    557 			mutex_enter(&proclist_mutex);
    558 			psignal(p, SIGPIPE);
    559 			mutex_exit(&proclist_mutex);
    560 		}
    561 	}
    562 	cnt -= auio.uio_resid;
    563 #ifdef KTRACE
    564 	if (ktriov != NULL) {
    565 		if (KTRPOINT(p, KTR_GENIO) && (error == 0))
    566 			ktrgenio(l, fd, UIO_WRITE, ktriov, cnt, error);
    567 		kmem_free(ktriov, iovlen);
    568 	}
    569 #endif
    570 	*retval = cnt;
    571  done:
    572 	if (needfree)
    573 		kmem_free(needfree, iovlen);
    574  out:
    575 	FILE_UNUSE(fp, l);
    576 	uvmspace_free(vm);
    577 	return (error);
    578 }
    579 
/*
 * Ioctl system call.  Decodes the command word, stages any IOC_IN data
 * from userspace into a kernel buffer, dispatches to the file's fo_ioctl
 * method, and copies IOC_OUT results back to the caller.
 */
/* ARGSUSED */
int
sys_ioctl(lwp_t *l, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(void *)	data;
	} */ *uap = v;
	struct file	*fp;
	proc_t		*p;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	void 		*data, *memp;
/* Argument payloads up to this size are staged on the stack. */
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	p = l->l_proc;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		/* com must be valid for the error-reporting switch at "out". */
		com = 0;
		goto out;
	}

	/*
	 * FIOCLEX/FIONCLEX manipulate per-descriptor flags rather than the
	 * file object itself, so handle them here without calling fo_ioctl.
	 */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;

	case FIOCLEX:
		rw_enter(&fdp->fd_lock, RW_WRITER);
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		rw_exit(&fdp->fd_lock);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		/* Payload too large for the stack buffer; heap-allocate. */
		memp = kmem_alloc(size, KM_SLEEP);
		data = memp;
	} else
		data = (void *)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					kmem_free(memp, size);
				goto out;
			}
#ifdef KTRACE
			/* Trace the inbound payload as generic I/O. */
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_WRITE, &iov,
					size, 0);
			}
#endif
		} else
			/* Zero-size IOC_IN: pass the user pointer itself. */
			*(void **)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(void **)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Toggle non-blocking mode on the file before notifying it. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data, l);
		break;

	case FIOASYNC:
		/* Toggle async (SIGIO) mode on the file before notifying it. */
		mutex_enter(&fp->f_lock);
		if (*(int *)data != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		mutex_exit(&fp->f_lock);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data, l);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, l);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size) {
			error = copyout(data, SCARG(uap, data), size);
#ifdef KTRACE
			/* Trace the outbound payload as generic I/O. */
			if (KTRPOINT(p, KTR_GENIO)) {
				struct iovec iov;
				iov.iov_base = SCARG(uap, data);
				iov.iov_len = size;
				ktrgenio(l, SCARG(uap, fd), UIO_READ, &iov,
					size, error);
			}
#endif
		}
		break;
	}
	if (memp)
		kmem_free(memp, size);
 out:
	FILE_UNUSE(fp, l);
	/*
	 * -1 and EPASSTHROUGH are internal sentinels from drivers; both
	 * are reported to userland as ENOTTY (with a console diagnostic
	 * for the obsolete -1 convention).
	 */
	switch (error) {
	case -1:
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
    735 
    736 /*
    737  * Select system call.
    738  */
    739 int
    740 sys_pselect(lwp_t *l, void *v, register_t *retval)
    741 {
    742 	struct sys_pselect_args /* {
    743 		syscallarg(int)				nd;
    744 		syscallarg(fd_set *)			in;
    745 		syscallarg(fd_set *)			ou;
    746 		syscallarg(fd_set *)			ex;
    747 		syscallarg(const struct timespec *)	ts;
    748 		syscallarg(sigset_t *)			mask;
    749 	} */ * const uap = v;
    750 	struct timespec	ats;
    751 	struct timeval	atv, *tv = NULL;
    752 	sigset_t	amask, *mask = NULL;
    753 	int		error;
    754 
    755 	if (SCARG(uap, ts)) {
    756 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
    757 		if (error)
    758 			return error;
    759 		atv.tv_sec = ats.tv_sec;
    760 		atv.tv_usec = ats.tv_nsec / 1000;
    761 		tv = &atv;
    762 	}
    763 	if (SCARG(uap, mask) != NULL) {
    764 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
    765 		if (error)
    766 			return error;
    767 		mask = &amask;
    768 	}
    769 
    770 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    771 	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
    772 }
    773 
/*
 * Set up timeout bookkeeping for select()/poll(): validate 'tv' and
 * record the current monotonic time in 'sleeptv' so gettimeleft() can
 * later compute the elapsed interval.  Returns -1 on an invalid timeout.
 */
int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv) != 0)
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}
    782 
    783 int
    784 gettimeleft(struct timeval *tv, struct timeval *sleeptv)
    785 {
    786 	/*
    787 	 * We have to recalculate the timeout on every retry.
    788 	 */
    789 	struct timeval slepttv;
    790 	/*
    791 	 * reduce tv by elapsed time
    792 	 * based on monotonic time scale
    793 	 */
    794 	getmicrouptime(&slepttv);
    795 	timeradd(tv, sleeptv, tv);
    796 	timersub(tv, &slepttv, tv);
    797 	*sleeptv = slepttv;
    798 	return tvtohz(tv);
    799 }
    800 
    801 int
    802 sys_select(lwp_t *l, void *v, register_t *retval)
    803 {
    804 	struct sys_select_args /* {
    805 		syscallarg(int)			nd;
    806 		syscallarg(fd_set *)		in;
    807 		syscallarg(fd_set *)		ou;
    808 		syscallarg(fd_set *)		ex;
    809 		syscallarg(struct timeval *)	tv;
    810 	} */ * const uap = v;
    811 	struct timeval atv, *tv = NULL;
    812 	int error;
    813 
    814 	if (SCARG(uap, tv)) {
    815 		error = copyin(SCARG(uap, tv), (void *)&atv,
    816 			sizeof(atv));
    817 		if (error)
    818 			return error;
    819 		tv = &atv;
    820 	}
    821 
    822 	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
    823 	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
    824 }
    825 
/*
 * Common code for select(2) and pselect(2).  Copies in the three fd_set
 * bitmaps, repeatedly scans them with selscan() until a descriptor is
 * ready, the timeout expires, or a signal arrives, and copies the result
 * bitmaps back out.  An optional temporary signal mask is installed for
 * the duration of the wait (pselect semantics).
 */
int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	/* Room for 3 input + 3 output bitmaps of FD_SETSIZE bits each. */
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char 		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval  sleeptv;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;

/* Copy in one user bitmap (or zero it when the pointer is NULL). */
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		/* Install the caller's mask for the duration of the wait. */
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Scan with the lock dropped.  l_selflag and the nselcoll
		 * generation count detect wakeups that race the scan: if
		 * either changed, re-scan instead of blocking.
		 */
	 	l->l_selflag = SEL_SCANNING;
		ncoll = nselcoll;
 		mutex_exit(&select_lock);

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		if (l->l_selflag != SEL_SCANNING || ncoll != nselcoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	selclear();
	mutex_exit(&select_lock);

	if (mask) {
		/* Restore the original signal mask. */
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}
    926 
    927 int
    928 selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    929 	register_t *retval)
    930 {
    931 	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    932 			       POLLWRNORM | POLLHUP | POLLERR,
    933 			       POLLRDBAND };
    934 	proc_t *p = l->l_proc;
    935 	struct filedesc	*fdp;
    936 	int msk, i, j, fd, n;
    937 	fd_mask ibits, obits;
    938 	struct file *fp;
    939 
    940 	fdp = p->p_fd;
    941 	n = 0;
    942 	for (msk = 0; msk < 3; msk++) {
    943 		for (i = 0; i < nfd; i += NFDBITS) {
    944 			ibits = *ibitp++;
    945 			obits = 0;
    946 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    947 				ibits &= ~(1 << j);
    948 				if ((fp = fd_getfile(fdp, fd)) == NULL)
    949 					return (EBADF);
    950 				FILE_USE(fp);
    951 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], l)) {
    952 					obits |= (1 << j);
    953 					n++;
    954 				}
    955 				FILE_UNUSE(fp, l);
    956 			}
    957 			*obitp++ = obits;
    958 		}
    959 	}
    960 	*retval = n;
    961 	return (0);
    962 }
    963 
    964 /*
    965  * Poll system call.
    966  */
    967 int
    968 sys_poll(lwp_t *l, void *v, register_t *retval)
    969 {
    970 	struct sys_poll_args /* {
    971 		syscallarg(struct pollfd *)	fds;
    972 		syscallarg(u_int)		nfds;
    973 		syscallarg(int)			timeout;
    974 	} */ * const uap = v;
    975 	struct timeval	atv, *tv = NULL;
    976 
    977 	if (SCARG(uap, timeout) != INFTIM) {
    978 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    979 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    980 		tv = &atv;
    981 	}
    982 
    983 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
    984 		tv, NULL);
    985 }
    986 
    987 /*
    988  * Poll system call.
    989  */
    990 int
    991 sys_pollts(lwp_t *l, void *v, register_t *retval)
    992 {
    993 	struct sys_pollts_args /* {
    994 		syscallarg(struct pollfd *)		fds;
    995 		syscallarg(u_int)			nfds;
    996 		syscallarg(const struct timespec *)	ts;
    997 		syscallarg(const sigset_t *)		mask;
    998 	} */ * const uap = v;
    999 	struct timespec	ats;
   1000 	struct timeval	atv, *tv = NULL;
   1001 	sigset_t	amask, *mask = NULL;
   1002 	int		error;
   1003 
   1004 	if (SCARG(uap, ts)) {
   1005 		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
   1006 		if (error)
   1007 			return error;
   1008 		atv.tv_sec = ats.tv_sec;
   1009 		atv.tv_usec = ats.tv_nsec / 1000;
   1010 		tv = &atv;
   1011 	}
   1012 	if (SCARG(uap, mask)) {
   1013 		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
   1014 		if (error)
   1015 			return error;
   1016 		mask = &amask;
   1017 	}
   1018 
   1019 	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
   1020 		tv, mask);
   1021 }
   1022 
/*
 * Common code for poll(2) and pollts(2): copy in the pollfd array,
 * repeatedly scan it until an event is found, the timeout expires,
 * or a signal arrives, then copy the results back to userspace.
 *
 * u_fds/nfds	user pollfd array and its length
 * tv		relative timeout, or NULL to block indefinitely
 * mask		temporary signal mask to install, or NULL
 */
int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	/* Small requests use the on-stack buffer; larger ones allocate. */
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	/* timo == 0 means wait forever in cv_timedwait_sig() below. */
	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	/* Install the caller-supplied signal mask, saving the old one. */
	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	mutex_enter(&select_lock);
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * Snapshot the collision counter and mark ourselves as
		 * scanning before dropping the lock; a wakeup arriving
		 * during the scan changes l_selflag or nselcoll and is
		 * detected below.
		 */
		ncoll = nselcoll;
		l->l_selflag = SEL_SCANNING;
		mutex_exit(&select_lock);

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		mutex_enter(&select_lock);
		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		/* An event may have fired mid-scan: rescan immediately. */
		if (l->l_selflag != SEL_SCANNING || nselcoll != ncoll)
			continue;
		l->l_selflag = SEL_BLOCKING;
		error = cv_timedwait_sig(&select_cv, &select_lock, timo);
		if (error != 0)
			break;
	}
	/* Detach from every selinfo we were recorded against. */
	selclear();
	mutex_exit(&select_lock);

	/* Restore the original signal mask if we replaced it. */
	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}
   1106 
   1107 int
   1108 pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
   1109 {
   1110 	proc_t		*p = l->l_proc;
   1111 	struct filedesc	*fdp;
   1112 	int		i, n;
   1113 	struct file	*fp;
   1114 
   1115 	fdp = p->p_fd;
   1116 	n = 0;
   1117 	for (i = 0; i < nfd; i++, fds++) {
   1118 		if (fds->fd >= fdp->fd_nfiles) {
   1119 			fds->revents = POLLNVAL;
   1120 			n++;
   1121 		} else if (fds->fd < 0) {
   1122 			fds->revents = 0;
   1123 		} else {
   1124 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
   1125 				fds->revents = POLLNVAL;
   1126 				n++;
   1127 			} else {
   1128 				FILE_USE(fp);
   1129 				fds->revents = (*fp->f_ops->fo_poll)(fp,
   1130 				    fds->events | POLLERR | POLLHUP, l);
   1131 				if (fds->revents != 0)
   1132 					n++;
   1133 				FILE_UNUSE(fp, l);
   1134 			}
   1135 		}
   1136 	}
   1137 	*retval = n;
   1138 	return (0);
   1139 }
   1140 
   1141 /*ARGSUSED*/
   1142 int
   1143 seltrue(dev_t dev, int events, lwp_t *l)
   1144 {
   1145 
   1146 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
   1147 }
   1148 
   1149 /*
   1150  * Record a select request.
   1151  */
   1152 void
   1153 selrecord(lwp_t *selector, struct selinfo *sip)
   1154 {
   1155 
   1156 	mutex_enter(&select_lock);
   1157 	if (sip->sel_lwp == NULL) {
   1158 		/* First named waiter, although there may be more. */
   1159 		sip->sel_lwp = selector;
   1160 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
   1161 	} else if (sip->sel_lwp != selector) {
   1162 		/* Multiple waiters. */
   1163 		sip->sel_collision = true;
   1164 	}
   1165 	mutex_exit(&select_lock);
   1166 }
   1167 
   1168 /*
   1169  * Do a wakeup when a selectable event occurs.
   1170  */
   1171 void
   1172 selwakeup(struct selinfo *sip)
   1173 {
   1174 	lwp_t *l;
   1175 
   1176 	mutex_enter(&select_lock);
   1177 	if (sip->sel_collision) {
   1178 		/* Multiple waiters - just notify everybody. */
   1179 		nselcoll++;
   1180 		sip->sel_collision = false;
   1181 		cv_broadcast(&select_cv);
   1182 	} else if (sip->sel_lwp != NULL) {
   1183 		/* Only one LWP waiting. */
   1184 		l = sip->sel_lwp;
   1185 		if (l->l_selflag == SEL_BLOCKING) {
   1186 			/*
   1187 			 * If it's sleeping, wake it up.  If not, it's
   1188 			 * already awake but hasn't yet removed itself
   1189 			 * from the selector.  We reset the state below
   1190 			 * so that we only attempt to do this once.
   1191 			 */
   1192 			lwp_lock(l);
   1193 			if (l->l_wchan == &select_cv) {
   1194 				/* lwp_unsleep() releases the LWP lock. */
   1195 				lwp_unsleep(l);
   1196 			} else
   1197 				lwp_unlock(l);
   1198 		} else {
   1199 			/*
   1200 			 * Not yet asleep.  Reset its state below so that
   1201 			 * it will go around again.
   1202 			 */
   1203 		}
   1204 		l->l_selflag = SEL_RESET;
   1205 	}
   1206 	mutex_exit(&select_lock);
   1207 }
   1208 
   1209 void
   1210 selnotify(struct selinfo *sip, long knhint)
   1211 {
   1212 
   1213 	selwakeup(sip);
   1214 	KNOTE(&sip->sel_klist, knhint);
   1215 }
   1216 
   1217 /*
   1218  * Remove an LWP from all objects that it is waiting for.
   1219  */
   1220 static void
   1221 selclear(void)
   1222 {
   1223 	struct selinfo *sip;
   1224 	lwp_t *l = curlwp;
   1225 
   1226 	KASSERT(mutex_owned(&select_lock));
   1227 
   1228 	SLIST_FOREACH(sip, &l->l_selwait, sel_chain) {
   1229 		KASSERT(sip->sel_lwp == l);
   1230 		sip->sel_lwp = NULL;
   1231 	}
   1232 }
   1233 
   1234 /*
   1235  * Initialize the select/poll system calls.
   1236  */
   1237 void
   1238 selsysinit(void)
   1239 {
   1240 
   1241 	mutex_init(&select_lock, MUTEX_DRIVER, IPL_VM);
   1242 	cv_init(&select_cv, "select");
   1243 }
   1244