Home | History | Annotate | Line # | Download | only in kern
sys_generic.c revision 1.44
      1 /*	$NetBSD: sys_generic.c,v 1.44 1998/08/04 04:03:15 perry Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1982, 1986, 1989, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  * (c) UNIX System Laboratories, Inc.
      7  * All or some portions of this file are derived from material licensed
      8  * to the University of California by American Telephone and Telegraph
      9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     10  * the permission of UNIX System Laboratories, Inc.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  * 3. All advertising materials mentioning features or use of this software
     21  *    must display the following acknowledgement:
     22  *	This product includes software developed by the University of
     23  *	California, Berkeley and its contributors.
     24  * 4. Neither the name of the University nor the names of its contributors
     25  *    may be used to endorse or promote products derived from this software
     26  *    without specific prior written permission.
     27  *
     28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     38  * SUCH DAMAGE.
     39  *
     40  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
     41  */
     42 
     43 #include "opt_ktrace.h"
     44 
     45 #include <sys/param.h>
     46 #include <sys/systm.h>
     47 #include <sys/filedesc.h>
     48 #include <sys/ioctl.h>
     49 #include <sys/file.h>
     50 #include <sys/proc.h>
     51 #include <sys/socketvar.h>
     52 #include <sys/signalvar.h>
     53 #include <sys/uio.h>
     54 #include <sys/kernel.h>
     55 #include <sys/stat.h>
     56 #include <sys/malloc.h>
     57 #include <sys/poll.h>
     58 #ifdef KTRACE
     59 #include <sys/ktrace.h>
     60 #endif
     61 
     62 #include <sys/mount.h>
     63 #include <sys/syscallargs.h>
     64 
        /*
         * Descriptor scanners defined later in this file: selscan() backs
         * sys_select() and pollscan() backs sys_poll().  Each stores the
         * number of ready descriptors through its register_t pointer.
         */
     65 int selscan __P((struct proc *, fd_mask *, fd_mask *, int, register_t *));
     66 int pollscan __P((struct proc *, struct pollfd *, int, register_t *));
     67 
     68 /*
     69  * Read system call.
     70  */
     71 /* ARGSUSED */
     72 int
     73 sys_read(p, v, retval)
     74 	struct proc *p;
     75 	void *v;
     76 	register_t *retval;
     77 {
     78 	register struct sys_read_args /* {
     79 		syscallarg(int) fd;
     80 		syscallarg(void *) buf;
     81 		syscallarg(size_t) nbyte;
     82 	} */ *uap = v;
     83 	int fd = SCARG(uap, fd);
     84 	register struct file *fp;
     85 	register struct filedesc *fdp = p->p_fd;
     86 
     87 	if ((u_int)fd >= fdp->fd_nfiles ||
     88 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
     89 	    (fp->f_flag & FREAD) == 0)
     90 		return (EBADF);
     91 
     92 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
     93 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
     94 }
     95 
     96 int
     97 dofileread(p, fd, fp, buf, nbyte, offset, flags, retval)
     98 	struct proc *p;
     99 	int fd;
    100 	struct file *fp;
    101 	void *buf;
    102 	size_t nbyte;
    103 	off_t *offset;
    104 	int flags;
    105 	register_t *retval;
    106 {
    107 	struct uio auio;
    108 	struct iovec aiov;
    109 	long cnt, error = 0;
    110 #ifdef KTRACE
    111 	struct iovec ktriov;
    112 #endif
    113 
    114 	aiov.iov_base = (caddr_t)buf;
    115 	aiov.iov_len = nbyte;
    116 	auio.uio_iov = &aiov;
    117 	auio.uio_iovcnt = 1;
    118 	auio.uio_resid = nbyte;
    119 	auio.uio_rw = UIO_READ;
    120 	auio.uio_segflg = UIO_USERSPACE;
    121 	auio.uio_procp = p;
    122 
    123 	/*
    124 	 * Reads return ssize_t because -1 is returned on error.  Therefore
    125 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    126 	 * values.
    127 	 */
    128 	if (auio.uio_resid > SSIZE_MAX)
    129 		return (EINVAL);
    130 
    131 #ifdef KTRACE
    132 	/*
    133 	 * if tracing, save a copy of iovec
    134 	 */
    135 	if (KTRPOINT(p, KTR_GENIO))
    136 		ktriov = aiov;
    137 #endif
    138 	cnt = auio.uio_resid;
    139 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    140 	if (error)
    141 		if (auio.uio_resid != cnt && (error == ERESTART ||
    142 		    error == EINTR || error == EWOULDBLOCK))
    143 			error = 0;
    144 	cnt -= auio.uio_resid;
    145 #ifdef KTRACE
    146 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
    147 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktriov, cnt, error);
    148 #endif
    149 	*retval = cnt;
    150 	return (error);
    151 }
    152 
    153 /*
    154  * Scatter read system call.
    155  */
    156 int
    157 sys_readv(p, v, retval)
    158 	struct proc *p;
    159 	void *v;
    160 	register_t *retval;
    161 {
    162 	register struct sys_readv_args /* {
    163 		syscallarg(int) fd;
    164 		syscallarg(const struct iovec *) iovp;
    165 		syscallarg(int) iovcnt;
    166 	} */ *uap = v;
    167 	int fd = SCARG(uap, fd);
    168 	register struct file *fp;
    169 	register struct filedesc *fdp = p->p_fd;
    170 
    171 	if ((u_int)fd >= fdp->fd_nfiles ||
    172 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
    173 	    (fp->f_flag & FREAD) == 0)
    174 		return (EBADF);
    175 
    176 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
    177 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    178 }
    179 
    180 int
    181 dofilereadv(p, fd, fp, iovp, iovcnt, offset, flags, retval)
    182 	struct proc *p;
    183 	int fd;
    184 	struct file *fp;
    185 	const struct iovec *iovp;
    186 	int iovcnt;
    187 	off_t *offset;
    188 	int flags;
    189 	register_t *retval;
    190 {
    191 	struct uio auio;
    192 	register struct iovec *iov;
    193 	struct iovec *needfree;
    194 	struct iovec aiov[UIO_SMALLIOV];
    195 	long i, cnt, error = 0;
    196 	u_int iovlen;
    197 #ifdef KTRACE
    198 	struct iovec *ktriov = NULL;
    199 #endif
    200 
    201 	/* note: can't use iovlen until iovcnt is validated */
    202 	iovlen = iovcnt * sizeof(struct iovec);
    203 	if ((u_int)iovcnt > UIO_SMALLIOV) {
    204 		if ((u_int)iovcnt > IOV_MAX)
    205 			return (EINVAL);
    206 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
    207 		needfree = iov;
    208 	} else if ((u_int)iovcnt > 0) {
    209 		iov = aiov;
    210 		needfree = NULL;
    211 	} else
    212 		return (EINVAL);
    213 
    214 	auio.uio_iov = iov;
    215 	auio.uio_iovcnt = iovcnt;
    216 	auio.uio_rw = UIO_READ;
    217 	auio.uio_segflg = UIO_USERSPACE;
    218 	auio.uio_procp = p;
    219 	error = copyin(iovp, iov, iovlen);
    220 	if (error)
    221 		goto done;
    222 	auio.uio_resid = 0;
    223 	for (i = 0; i < iovcnt; i++) {
    224 		auio.uio_resid += iov->iov_len;
    225 		/*
    226 		 * Reads return ssize_t because -1 is returned on error.
    227 		 * Therefore we must restrict the length to SSIZE_MAX to
    228 		 * avoid garbage return values.
    229 		 */
    230 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    231 			error = EINVAL;
    232 			goto done;
    233 		}
    234 		iov++;
    235 	}
    236 #ifdef KTRACE
    237 	/*
    238 	 * if tracing, save a copy of iovec
    239 	 */
    240 	if (KTRPOINT(p, KTR_GENIO))  {
    241 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
    242 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
    243 	}
    244 #endif
    245 	cnt = auio.uio_resid;
    246 	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
    247 	if (error)
    248 		if (auio.uio_resid != cnt && (error == ERESTART ||
    249 		    error == EINTR || error == EWOULDBLOCK))
    250 			error = 0;
    251 	cnt -= auio.uio_resid;
    252 #ifdef KTRACE
    253 	if (KTRPOINT(p, KTR_GENIO))
    254 		if (error == 0) {
    255 			ktrgenio(p->p_tracep, fd, UIO_READ, ktriov, cnt,
    256 			    error);
    257 		FREE(ktriov, M_TEMP);
    258 	}
    259 #endif
    260 	*retval = cnt;
    261 done:
    262 	if (needfree)
    263 		FREE(needfree, M_IOV);
    264 	return (error);
    265 }
    266 
    267 /*
    268  * Write system call
    269  */
    270 int
    271 sys_write(p, v, retval)
    272 	struct proc *p;
    273 	void *v;
    274 	register_t *retval;
    275 {
    276 	register struct sys_write_args /* {
    277 		syscallarg(int) fd;
    278 		syscallarg(const void *) buf;
    279 		syscallarg(size_t) nbyte;
    280 	} */ *uap = v;
    281 	int fd = SCARG(uap, fd);
    282 	register struct file *fp;
    283 	register struct filedesc *fdp = p->p_fd;
    284 
    285 	if ((u_int)fd >= fdp->fd_nfiles ||
    286 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
    287 	    (fp->f_flag & FWRITE) == 0)
    288 		return (EBADF);
    289 
    290 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
    291 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    292 }
    293 
    294 int
    295 dofilewrite(p, fd, fp, buf, nbyte, offset, flags, retval)
    296 	struct proc *p;
    297 	int fd;
    298 	struct file *fp;
    299 	const void *buf;
    300 	size_t nbyte;
    301 	off_t *offset;
    302 	int flags;
    303 	register_t *retval;
    304 {
    305 	struct uio auio;
    306 	struct iovec aiov;
    307 	long cnt, error = 0;
    308 #ifdef KTRACE
    309 	struct iovec ktriov;
    310 #endif
    311 
    312 	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
    313 	aiov.iov_len = nbyte;
    314 	auio.uio_iov = &aiov;
    315 	auio.uio_iovcnt = 1;
    316 	auio.uio_resid = nbyte;
    317 	auio.uio_rw = UIO_WRITE;
    318 	auio.uio_segflg = UIO_USERSPACE;
    319 	auio.uio_procp = p;
    320 
    321 	/*
    322 	 * Writes return ssize_t because -1 is returned on error.  Therefore
    323 	 * we must restrict the length to SSIZE_MAX to avoid garbage return
    324 	 * values.
    325 	 */
    326 	if (auio.uio_resid > SSIZE_MAX)
    327 		return (EINVAL);
    328 
    329 #ifdef KTRACE
    330 	/*
    331 	 * if tracing, save a copy of iovec
    332 	 */
    333 	if (KTRPOINT(p, KTR_GENIO))
    334 		ktriov = aiov;
    335 #endif
    336 	cnt = auio.uio_resid;
    337 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    338 	if (error) {
    339 		if (auio.uio_resid != cnt && (error == ERESTART ||
    340 		    error == EINTR || error == EWOULDBLOCK))
    341 			error = 0;
    342 		if (error == EPIPE)
    343 			psignal(p, SIGPIPE);
    344 	}
    345 	cnt -= auio.uio_resid;
    346 #ifdef KTRACE
    347 	if (KTRPOINT(p, KTR_GENIO) && error == 0)
    348 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktriov, cnt, error);
    349 #endif
    350 	*retval = cnt;
    351 	return (error);
    352 }
    353 
    354 /*
    355  * Gather write system call
    356  */
    357 int
    358 sys_writev(p, v, retval)
    359 	struct proc *p;
    360 	void *v;
    361 	register_t *retval;
    362 {
    363 	register struct sys_writev_args /* {
    364 		syscallarg(int) fd;
    365 		syscallarg(const struct iovec *) iovp;
    366 		syscallarg(int) iovcnt;
    367 	} */ *uap = v;
    368 	int fd = SCARG(uap, fd);
    369 	register struct file *fp;
    370 	register struct filedesc *fdp = p->p_fd;
    371 
    372 	if ((u_int)fd >= fdp->fd_nfiles ||
    373 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
    374 	    (fp->f_flag & FWRITE) == 0)
    375 		return (EBADF);
    376 
    377 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
    378 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
    379 }
    380 
    381 int
    382 dofilewritev(p, fd, fp, iovp, iovcnt, offset, flags, retval)
    383 	struct proc *p;
    384 	int fd;
    385 	struct file *fp;
    386 	const struct iovec *iovp;
    387 	int iovcnt;
    388 	off_t *offset;
    389 	int flags;
    390 	register_t *retval;
    391 {
    392 	struct uio auio;
    393 	register struct iovec *iov;
    394 	struct iovec *needfree;
    395 	struct iovec aiov[UIO_SMALLIOV];
    396 	long i, cnt, error = 0;
    397 	u_int iovlen;
    398 #ifdef KTRACE
    399 	struct iovec *ktriov = NULL;
    400 #endif
    401 
    402 	/* note: can't use iovlen until iovcnt is validated */
    403 	iovlen = iovcnt * sizeof(struct iovec);
    404 	if ((u_int)iovcnt > UIO_SMALLIOV) {
    405 		if ((u_int)iovcnt > IOV_MAX)
    406 			return (EINVAL);
    407 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
    408 		needfree = iov;
    409 	} else if ((u_int)iovcnt > 0) {
    410 		iov = aiov;
    411 		needfree = NULL;
    412 	} else
    413 		return (EINVAL);
    414 
    415 	auio.uio_iov = iov;
    416 	auio.uio_iovcnt = iovcnt;
    417 	auio.uio_rw = UIO_WRITE;
    418 	auio.uio_segflg = UIO_USERSPACE;
    419 	auio.uio_procp = p;
    420 	error = copyin(iovp, iov, iovlen);
    421 	if (error)
    422 		goto done;
    423 	auio.uio_resid = 0;
    424 	for (i = 0; i < iovcnt; i++) {
    425 		auio.uio_resid += iov->iov_len;
    426 		/*
    427 		 * Writes return ssize_t because -1 is returned on error.
    428 		 * Therefore we must restrict the length to SSIZE_MAX to
    429 		 * avoid garbage return values.
    430 		 */
    431 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
    432 			error = EINVAL;
    433 			goto done;
    434 		}
    435 		iov++;
    436 	}
    437 #ifdef KTRACE
    438 	/*
    439 	 * if tracing, save a copy of iovec
    440 	 */
    441 	if (KTRPOINT(p, KTR_GENIO))  {
    442 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
    443 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
    444 	}
    445 #endif
    446 	cnt = auio.uio_resid;
    447 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
    448 	if (error) {
    449 		if (auio.uio_resid != cnt && (error == ERESTART ||
    450 		    error == EINTR || error == EWOULDBLOCK))
    451 			error = 0;
    452 		if (error == EPIPE)
    453 			psignal(p, SIGPIPE);
    454 	}
    455 	cnt -= auio.uio_resid;
    456 #ifdef KTRACE
    457 	if (KTRPOINT(p, KTR_GENIO))
    458 		if (error == 0) {
    459 			ktrgenio(p->p_tracep, fd, UIO_WRITE, ktriov, cnt,
    460 			    error);
    461 		FREE(ktriov, M_TEMP);
    462 	}
    463 #endif
    464 	*retval = cnt;
    465 done:
    466 	if (needfree)
    467 		FREE(needfree, M_IOV);
    468 	return (error);
    469 }
    470 
    471 /*
    472  * Ioctl system call
    473  */
    474 /* ARGSUSED */
    475 int
    476 sys_ioctl(p, v, retval)
    477 	struct proc *p;
    478 	void *v;
    479 	register_t *retval;
    480 {
    481 	register struct sys_ioctl_args /* {
    482 		syscallarg(int) fd;
    483 		syscallarg(u_long) com;
    484 		syscallarg(caddr_t) data;
    485 	} */ *uap = v;
    486 	register struct file *fp;
    487 	register struct filedesc *fdp;
    488 	register u_long com;
    489 	register int error;
    490 	register u_int size;
    491 	caddr_t data, memp;
    492 	int tmp;
    493 #define STK_PARAMS	128
    494 	char stkbuf[STK_PARAMS];
    495 
    496 	fdp = p->p_fd;
    497 	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
    498 	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
    499 		return (EBADF);
    500 
    501 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
    502 		return (EBADF);
    503 
    504 	switch (com = SCARG(uap, com)) {
    505 	case FIONCLEX:
    506 		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
    507 		return (0);
    508 	case FIOCLEX:
    509 		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
    510 		return (0);
    511 	}
    512 
    513 	/*
    514 	 * Interpret high order word to find amount of data to be
    515 	 * copied to/from the user's address space.
    516 	 */
    517 	size = IOCPARM_LEN(com);
    518 	if (size > IOCPARM_MAX)
    519 		return (ENOTTY);
    520 	memp = NULL;
    521 	if (size > sizeof(stkbuf)) {
    522 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
    523 		data = memp;
    524 	} else
    525 		data = stkbuf;
    526 	if (com&IOC_IN) {
    527 		if (size) {
    528 			error = copyin(SCARG(uap, data), data, size);
    529 			if (error) {
    530 				if (memp)
    531 					free(memp, M_IOCTLOPS);
    532 				return (error);
    533 			}
    534 		} else
    535 			*(caddr_t *)data = SCARG(uap, data);
    536 	} else if ((com&IOC_OUT) && size)
    537 		/*
    538 		 * Zero the buffer so the user always
    539 		 * gets back something deterministic.
    540 		 */
    541 		memset(data, 0, size);
    542 	else if (com&IOC_VOID)
    543 		*(caddr_t *)data = SCARG(uap, data);
    544 
    545 	switch (com) {
    546 
    547 	case FIONBIO:
    548 		if ((tmp = *(int *)data) != 0)
    549 			fp->f_flag |= FNONBLOCK;
    550 		else
    551 			fp->f_flag &= ~FNONBLOCK;
    552 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
    553 		break;
    554 
    555 	case FIOASYNC:
    556 		if ((tmp = *(int *)data) != 0)
    557 			fp->f_flag |= FASYNC;
    558 		else
    559 			fp->f_flag &= ~FASYNC;
    560 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
    561 		break;
    562 
    563 	case FIOSETOWN:
    564 		tmp = *(int *)data;
    565 		if (fp->f_type == DTYPE_SOCKET) {
    566 			((struct socket *)fp->f_data)->so_pgid = tmp;
    567 			error = 0;
    568 			break;
    569 		}
    570 		if (tmp <= 0) {
    571 			tmp = -tmp;
    572 		} else {
    573 			struct proc *p1 = pfind(tmp);
    574 			if (p1 == 0) {
    575 				error = ESRCH;
    576 				break;
    577 			}
    578 			tmp = p1->p_pgrp->pg_id;
    579 		}
    580 		error = (*fp->f_ops->fo_ioctl)
    581 			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
    582 		break;
    583 
    584 	case FIOGETOWN:
    585 		if (fp->f_type == DTYPE_SOCKET) {
    586 			error = 0;
    587 			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
    588 			break;
    589 		}
    590 		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
    591 		*(int *)data = -*(int *)data;
    592 		break;
    593 
    594 	default:
    595 		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
    596 		/*
    597 		 * Copy any data to user, size was
    598 		 * already set and checked above.
    599 		 */
    600 		if (error == 0 && (com&IOC_OUT) && size)
    601 			error = copyout(data, SCARG(uap, data), size);
    602 		break;
    603 	}
    604 	if (memp)
    605 		free(memp, M_IOCTLOPS);
    606 	return (error);
    607 }
    608 
        /*
         * selwait:  only its address is used in this file, as the sleep
         *           channel that select/poll tsleep() on and that
         *           selwakeup() passes to wakeup().
         * nselcoll: incremented by selwakeup() when it finds SI_COLL set;
         *           select/poll sample it before scanning and rescan if it
         *           changed in the meantime.
         */
    609 int	selwait, nselcoll;
    610 
    611 /*
    612  * Select system call.
    613  */
    614 int
    615 sys_select(p, v, retval)
    616 	register struct proc *p;
    617 	void *v;
    618 	register_t *retval;
    619 {
    620 	register struct sys_select_args /* {
    621 		syscallarg(int) nd;
    622 		syscallarg(fd_set *) in;
    623 		syscallarg(fd_set *) ou;
    624 		syscallarg(fd_set *) ex;
    625 		syscallarg(struct timeval *) tv;
    626 	} */ *uap = v;
    627 	caddr_t bits;
    628 	char smallbits[howmany(FD_SETSIZE, NFDBITS) * sizeof(fd_mask) * 6];
    629 	struct timeval atv;
    630 	int s, ncoll, error = 0, timo;
    631 	size_t ni;
    632 
    633 	if (SCARG(uap, nd) < 0)
    634 		return (EINVAL);
    635 	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
    636 		/* forgiving; slightly wrong */
    637 		SCARG(uap, nd) = p->p_fd->fd_nfiles;
    638 	}
    639 	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
    640 	if (ni * 6 > sizeof(smallbits))
    641 		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
    642 	else
    643 		bits = smallbits;
    644 
    645 #define	getbits(name, x) \
    646 	if (SCARG(uap, name)) { \
    647 		error = copyin(SCARG(uap, name), bits + ni * x, ni); \
    648 		if (error) \
    649 			goto done; \
    650 	} else \
    651 		memset(bits + ni * x, 0, ni);
    652 	getbits(in, 0);
    653 	getbits(ou, 1);
    654 	getbits(ex, 2);
    655 #undef	getbits
    656 
    657 	if (SCARG(uap, tv)) {
    658 		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
    659 			sizeof(atv));
    660 		if (error)
    661 			goto done;
    662 		if (itimerfix(&atv)) {
    663 			error = EINVAL;
    664 			goto done;
    665 		}
    666 		s = splclock();
    667 		timeradd(&atv, &time, &atv);
    668 		timo = hzto(&atv);
    669 		/*
    670 		 * Avoid inadvertently sleeping forever.
    671 		 */
    672 		if (timo == 0)
    673 			timo = 1;
    674 		splx(s);
    675 	} else
    676 		timo = 0;
    677 retry:
    678 	ncoll = nselcoll;
    679 	p->p_flag |= P_SELECT;
    680 	error = selscan(p, (fd_mask *)(bits + ni * 0),
    681 			   (fd_mask *)(bits + ni * 3), SCARG(uap, nd), retval);
    682 	if (error || *retval)
    683 		goto done;
    684 	s = splhigh();
    685 	if (timo && timercmp(&time, &atv, >=)) {
    686 		splx(s);
    687 		goto done;
    688 	}
    689 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
    690 		splx(s);
    691 		goto retry;
    692 	}
    693 	p->p_flag &= ~P_SELECT;
    694 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
    695 	splx(s);
    696 	if (error == 0)
    697 		goto retry;
    698 done:
    699 	p->p_flag &= ~P_SELECT;
    700 	/* select is not restarted after signals... */
    701 	if (error == ERESTART)
    702 		error = EINTR;
    703 	if (error == EWOULDBLOCK)
    704 		error = 0;
    705 	if (error == 0) {
    706 #define	putbits(name, x) \
    707 		if (SCARG(uap, name)) { \
    708 			error = copyout(bits + ni * x, SCARG(uap, name), ni); \
    709 			if (error) \
    710 				goto out; \
    711 		}
    712 		putbits(in, 3);
    713 		putbits(ou, 4);
    714 		putbits(ex, 5);
    715 #undef putbits
    716 	}
    717 out:
    718 	if (ni * 6 > sizeof(smallbits))
    719 		free(bits, M_TEMP);
    720 	return (error);
    721 }
    722 
    723 int
    724 selscan(p, ibitp, obitp, nfd, retval)
    725 	struct proc *p;
    726 	fd_mask *ibitp, *obitp;
    727 	int nfd;
    728 	register_t *retval;
    729 {
    730 	register struct filedesc *fdp = p->p_fd;
    731 	register int msk, i, j, fd;
    732 	register fd_mask ibits, obits;
    733 	struct file *fp;
    734 	int n = 0;
    735 	static int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
    736 			       POLLWRNORM | POLLHUP | POLLERR,
    737 			       POLLRDBAND };
    738 
    739 	for (msk = 0; msk < 3; msk++) {
    740 		for (i = 0; i < nfd; i += NFDBITS) {
    741 			ibits = *ibitp++;
    742 			obits = 0;
    743 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
    744 				ibits &= ~(1 << j);
    745 				fp = fdp->fd_ofiles[fd];
    746 				if (fp == NULL)
    747 					return (EBADF);
    748 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
    749 					obits |= (1 << j);
    750 					n++;
    751 				}
    752 			}
    753 			*obitp++ = obits;
    754 		}
    755 	}
    756 	*retval = n;
    757 	return (0);
    758 }
    759 
    760 /*
    761  * Poll system call.
    762  */
    763 int
    764 sys_poll(p, v, retval)
    765 	register struct proc *p;
    766 	void *v;
    767 	register_t *retval;
    768 {
    769 	register struct sys_poll_args /* {
    770 		syscallarg(struct pollfd *) fds;
    771 		syscallarg(u_int) nfds;
    772 		syscallarg(int) timeout;
    773 	} */ *uap = v;
    774 	caddr_t bits;
    775 	char smallbits[32 * sizeof(struct pollfd)];
    776 	struct timeval atv;
    777 	int s, ncoll, error = 0, timo;
    778 	size_t ni;
    779 
    780 	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
    781 		/* forgiving; slightly wrong */
    782 		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
    783 	}
    784 	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
    785 	if (ni > sizeof(smallbits))
    786 		bits = malloc(ni, M_TEMP, M_WAITOK);
    787 	else
    788 		bits = smallbits;
    789 
    790 	error = copyin(SCARG(uap, fds), bits, ni);
    791 	if (error)
    792 		goto done;
    793 
    794 	if (SCARG(uap, timeout) != INFTIM) {
    795 		atv.tv_sec = SCARG(uap, timeout) / 1000;
    796 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
    797 		if (itimerfix(&atv)) {
    798 			error = EINVAL;
    799 			goto done;
    800 		}
    801 		s = splclock();
    802 		timeradd(&atv, &time, &atv);
    803 		timo = hzto(&atv);
    804 		/*
    805 		 * Avoid inadvertently sleeping forever.
    806 		 */
    807 		if (timo == 0)
    808 			timo = 1;
    809 		splx(s);
    810 	} else
    811 		timo = 0;
    812 retry:
    813 	ncoll = nselcoll;
    814 	p->p_flag |= P_SELECT;
    815 	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds), retval);
    816 	if (error || *retval)
    817 		goto done;
    818 	s = splhigh();
    819 	if (timo && timercmp(&time, &atv, >=)) {
    820 		splx(s);
    821 		goto done;
    822 	}
    823 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
    824 		splx(s);
    825 		goto retry;
    826 	}
    827 	p->p_flag &= ~P_SELECT;
    828 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
    829 	splx(s);
    830 	if (error == 0)
    831 		goto retry;
    832 done:
    833 	p->p_flag &= ~P_SELECT;
    834 	/* poll is not restarted after signals... */
    835 	if (error == ERESTART)
    836 		error = EINTR;
    837 	if (error == EWOULDBLOCK)
    838 		error = 0;
    839 	if (error == 0) {
    840 		error = copyout(bits, SCARG(uap, fds), ni);
    841 		if (error)
    842 			goto out;
    843 	}
    844 out:
    845 	if (ni > sizeof(smallbits))
    846 		free(bits, M_TEMP);
    847 	return (error);
    848 }
    849 
    850 int
    851 pollscan(p, fds, nfd, retval)
    852 	struct proc *p;
    853 	struct pollfd *fds;
    854 	int nfd;
    855 	register_t *retval;
    856 {
    857 	register struct filedesc *fdp = p->p_fd;
    858 	int i;
    859 	struct file *fp;
    860 	int n = 0;
    861 
    862 	for (i = 0; i < nfd; i++, fds++) {
    863 		if ((u_int)fds->fd >= fdp->fd_nfiles) {
    864 			fds->revents = POLLNVAL;
    865 			n++;
    866 		} else {
    867 			fp = fdp->fd_ofiles[fds->fd];
    868 			if (fp == 0) {
    869 				fds->revents = POLLNVAL;
    870 				n++;
    871 			} else {
    872 				fds->revents = (*fp->f_ops->fo_poll)(fp,
    873 				    fds->events | POLLERR | POLLHUP, p);
    874 				if (fds->revents != 0)
    875 					n++;
    876 			}
    877 		}
    878 	}
    879 	*retval = n;
    880 	return (0);
    881 }
    882 
/*
 * Poll routine that reports normal read and write readiness
 * unconditionally: returns the subset of `events' made up of POLLIN,
 * POLLOUT, POLLRDNORM and POLLWRNORM.  dev and p are not examined.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
    893 
    894 /*
    895  * Record a select request.
    896  */
    897 void
    898 selrecord(selector, sip)
    899 	struct proc *selector;
    900 	struct selinfo *sip;
    901 {
    902 	struct proc *p;
    903 	pid_t mypid;
    904 
    905 	mypid = selector->p_pid;
    906 	if (sip->si_pid == mypid)
    907 		return;
    908 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
    909 	    p->p_wchan == (caddr_t)&selwait)
    910 		sip->si_flags |= SI_COLL;
    911 	else
    912 		sip->si_pid = mypid;
    913 }
    914 
    915 /*
    916  * Do a wakeup when a selectable event occurs.
    917  */
    918 void
    919 selwakeup(sip)
    920 	register struct selinfo *sip;
    921 {
    922 	register struct proc *p;
    923 	int s;
    924 
    925 	if (sip->si_pid == 0)
    926 		return;
    927 	if (sip->si_flags & SI_COLL) {
    928 		nselcoll++;
    929 		sip->si_flags &= ~SI_COLL;
    930 		wakeup((caddr_t)&selwait);
    931 	}
    932 	p = pfind(sip->si_pid);
    933 	sip->si_pid = 0;
    934 	if (p != NULL) {
    935 		s = splhigh();
    936 		if (p->p_wchan == (caddr_t)&selwait) {
    937 			if (p->p_stat == SSLEEP)
    938 				setrunnable(p);
    939 			else
    940 				unsleep(p);
    941 		} else if (p->p_flag & P_SELECT)
    942 			p->p_flag &= ~P_SELECT;
    943 		splx(s);
    944 	}
    945 }
    946