Home | History | Annotate | Line # | Download | only in kern
kern_descrip.c revision 1.225.2.1.2.1
      1  1.225.2.1.2.1       snj /*	$NetBSD: kern_descrip.c,v 1.225.2.1.2.1 2017/06/03 16:56:32 snj Exp $	*/
      2          1.173        ad 
      3          1.173        ad /*-
      4          1.190        ad  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
      5          1.173        ad  * All rights reserved.
      6          1.173        ad  *
      7          1.190        ad  * This code is derived from software contributed to The NetBSD Foundation
      8          1.190        ad  * by Andrew Doran.
      9          1.190        ad  *
     10          1.173        ad  * Redistribution and use in source and binary forms, with or without
     11          1.173        ad  * modification, are permitted provided that the following conditions
     12          1.173        ad  * are met:
     13          1.173        ad  * 1. Redistributions of source code must retain the above copyright
     14          1.173        ad  *    notice, this list of conditions and the following disclaimer.
     15          1.173        ad  * 2. Redistributions in binary form must reproduce the above copyright
     16          1.173        ad  *    notice, this list of conditions and the following disclaimer in the
     17          1.173        ad  *    documentation and/or other materials provided with the distribution.
     18          1.173        ad  *
     19          1.173        ad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20          1.173        ad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21          1.173        ad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22          1.173        ad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23          1.173        ad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24          1.173        ad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25          1.173        ad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26          1.173        ad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27          1.173        ad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28          1.173        ad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29          1.173        ad  * POSSIBILITY OF SUCH DAMAGE.
     30          1.173        ad  */
     31           1.22       cgd 
     32           1.16       cgd /*
     33           1.17       cgd  * Copyright (c) 1982, 1986, 1989, 1991, 1993
     34           1.17       cgd  *	The Regents of the University of California.  All rights reserved.
     35           1.16       cgd  * (c) UNIX System Laboratories, Inc.
     36           1.16       cgd  * All or some portions of this file are derived from material licensed
     37           1.16       cgd  * to the University of California by American Telephone and Telegraph
     38           1.16       cgd  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     39           1.16       cgd  * the permission of UNIX System Laboratories, Inc.
     40           1.16       cgd  *
     41           1.16       cgd  * Redistribution and use in source and binary forms, with or without
     42           1.16       cgd  * modification, are permitted provided that the following conditions
     43           1.16       cgd  * are met:
     44           1.16       cgd  * 1. Redistributions of source code must retain the above copyright
     45           1.16       cgd  *    notice, this list of conditions and the following disclaimer.
     46           1.16       cgd  * 2. Redistributions in binary form must reproduce the above copyright
     47           1.16       cgd  *    notice, this list of conditions and the following disclaimer in the
     48           1.16       cgd  *    documentation and/or other materials provided with the distribution.
     49          1.111       agc  * 3. Neither the name of the University nor the names of its contributors
     50           1.16       cgd  *    may be used to endorse or promote products derived from this software
     51           1.16       cgd  *    without specific prior written permission.
     52           1.16       cgd  *
     53           1.16       cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54           1.16       cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55           1.16       cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56           1.16       cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57           1.16       cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58           1.16       cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59           1.16       cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60           1.16       cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61           1.16       cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62           1.16       cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63           1.16       cgd  * SUCH DAMAGE.
     64           1.16       cgd  *
     65           1.51      fvdl  *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
     66           1.16       cgd  */
     67           1.81     lukem 
     68          1.173        ad /*
     69          1.173        ad  * File descriptor management.
     70          1.173        ad  */
     71          1.173        ad 
     72           1.81     lukem #include <sys/cdefs.h>
     73  1.225.2.1.2.1       snj __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.225.2.1.2.1 2017/06/03 16:56:32 snj Exp $");
     74           1.50       mrg 
     75           1.16       cgd #include <sys/param.h>
     76           1.16       cgd #include <sys/systm.h>
     77           1.16       cgd #include <sys/filedesc.h>
     78           1.16       cgd #include <sys/kernel.h>
     79           1.16       cgd #include <sys/proc.h>
     80           1.16       cgd #include <sys/file.h>
     81           1.16       cgd #include <sys/socket.h>
     82           1.16       cgd #include <sys/socketvar.h>
     83           1.16       cgd #include <sys/stat.h>
     84           1.16       cgd #include <sys/ioctl.h>
     85           1.16       cgd #include <sys/fcntl.h>
     86           1.55   thorpej #include <sys/pool.h>
     87           1.17       cgd #include <sys/unistd.h>
     88           1.16       cgd #include <sys/resourcevar.h>
     89           1.42  christos #include <sys/conf.h>
     90           1.96  jdolecek #include <sys/event.h>
     91          1.143      elad #include <sys/kauth.h>
     92          1.163        ad #include <sys/atomic.h>
     93           1.25       cgd #include <sys/syscallargs.h>
     94          1.176        ad #include <sys/cpu.h>
     95          1.184     pooka #include <sys/kmem.h>
     96          1.184     pooka #include <sys/vnode.h>
     97          1.210     pooka #include <sys/sysctl.h>
     98          1.210     pooka #include <sys/ktrace.h>
     99           1.38  christos 
    100          1.213     rmind /*
    101          1.213     rmind  * A list (head) of open files, counter, and lock protecting them.
    102          1.213     rmind  */
    103          1.213     rmind struct filelist		filehead	__cacheline_aligned;
    104          1.213     rmind static u_int		nfiles		__cacheline_aligned;
    105          1.213     rmind kmutex_t		filelist_lock	__cacheline_aligned;
    106          1.213     rmind 
    107          1.213     rmind static pool_cache_t	filedesc_cache	__read_mostly;
    108          1.213     rmind static pool_cache_t	file_cache	__read_mostly;
    109          1.213     rmind static pool_cache_t	fdfile_cache	__read_mostly;
    110          1.213     rmind 
    111          1.167        ad static int	file_ctor(void *, void *, int);
    112          1.167        ad static void	file_dtor(void *, void *);
    113          1.173        ad static int	fdfile_ctor(void *, void *, int);
    114          1.173        ad static void	fdfile_dtor(void *, void *);
    115          1.173        ad static int	filedesc_ctor(void *, void *, int);
    116          1.173        ad static void	filedesc_dtor(void *, void *);
    117          1.173        ad static int	filedescopen(dev_t, int, int, lwp_t *);
    118          1.162        ad 
    119          1.210     pooka static int sysctl_kern_file(SYSCTLFN_PROTO);
    120          1.210     pooka static int sysctl_kern_file2(SYSCTLFN_PROTO);
    121          1.210     pooka static void fill_file(struct kinfo_file *, const file_t *, const fdfile_t *,
    122          1.210     pooka 		      int, pid_t);
    123          1.210     pooka 
    124          1.173        ad const struct cdevsw filedesc_cdevsw = {
    125          1.224  dholland 	.d_open = filedescopen,
    126          1.224  dholland 	.d_close = noclose,
    127          1.224  dholland 	.d_read = noread,
    128          1.224  dholland 	.d_write = nowrite,
    129          1.224  dholland 	.d_ioctl = noioctl,
    130          1.224  dholland 	.d_stop = nostop,
    131          1.224  dholland 	.d_tty = notty,
    132          1.224  dholland 	.d_poll = nopoll,
    133          1.224  dholland 	.d_mmap = nommap,
    134          1.224  dholland 	.d_kqfilter = nokqfilter,
    135          1.225  dholland 	.d_discard = nodiscard,
    136          1.224  dholland 	.d_flag = D_OTHER | D_MPSAFE
    137          1.173        ad };
    138          1.173        ad 
    139          1.173        ad /* For ease of reading. */
    140          1.173        ad __strong_alias(fd_putvnode,fd_putfile)
    141          1.173        ad __strong_alias(fd_putsock,fd_putfile)
    142          1.173        ad 
    143          1.173        ad /*
    144          1.173        ad  * Initialize the descriptor system.
    145          1.173        ad  */
    146          1.173        ad void
    147          1.173        ad fd_sys_init(void)
    148          1.173        ad {
    149          1.210     pooka 	static struct sysctllog *clog;
    150          1.173        ad 
    151          1.173        ad 	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
    152          1.173        ad 
    153          1.174        ad 	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
    154          1.173        ad 	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
    155          1.173        ad 	KASSERT(file_cache != NULL);
    156          1.173        ad 
    157          1.174        ad 	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
    158          1.173        ad 	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
    159          1.173        ad 	    NULL);
    160          1.173        ad 	KASSERT(fdfile_cache != NULL);
    161          1.173        ad 
    162          1.174        ad 	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
    163          1.173        ad 	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
    164          1.173        ad 	    NULL);
    165          1.173        ad 	KASSERT(filedesc_cache != NULL);
    166          1.210     pooka 
    167          1.210     pooka 	sysctl_createv(&clog, 0, NULL, NULL,
    168          1.210     pooka 		       CTLFLAG_PERMANENT,
    169          1.210     pooka 		       CTLTYPE_STRUCT, "file",
    170          1.210     pooka 		       SYSCTL_DESCR("System open file table"),
    171          1.210     pooka 		       sysctl_kern_file, 0, NULL, 0,
    172          1.210     pooka 		       CTL_KERN, KERN_FILE, CTL_EOL);
    173          1.210     pooka 	sysctl_createv(&clog, 0, NULL, NULL,
    174          1.210     pooka 		       CTLFLAG_PERMANENT,
    175          1.210     pooka 		       CTLTYPE_STRUCT, "file2",
    176          1.210     pooka 		       SYSCTL_DESCR("System open file table"),
    177          1.210     pooka 		       sysctl_kern_file2, 0, NULL, 0,
    178          1.210     pooka 		       CTL_KERN, KERN_FILE2, CTL_EOL);
    179          1.173        ad }
    180           1.72     lukem 
    181          1.192        ad static bool
    182          1.192        ad fd_isused(filedesc_t *fdp, unsigned fd)
    183          1.192        ad {
    184          1.192        ad 	u_int off = fd >> NDENTRYSHIFT;
    185          1.192        ad 
    186          1.192        ad 	KASSERT(fd < fdp->fd_dt->dt_nfiles);
    187          1.192        ad 
    188          1.192        ad 	return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
    189          1.192        ad }
    190          1.192        ad 
    191          1.192        ad /*
    192          1.192        ad  * Verify that the bitmaps match the descriptor table.
    193          1.192        ad  */
    194          1.192        ad static inline void
    195          1.192        ad fd_checkmaps(filedesc_t *fdp)
    196          1.192        ad {
    197          1.192        ad #ifdef DEBUG
    198          1.192        ad 	fdtab_t *dt;
    199          1.192        ad 	u_int fd;
    200          1.192        ad 
    201          1.192        ad 	dt = fdp->fd_dt;
    202          1.196      yamt 	if (fdp->fd_refcnt == -1) {
    203          1.196      yamt 		/*
    204          1.196      yamt 		 * fd_free tears down the table without maintaining its bitmap.
    205          1.196      yamt 		 */
    206          1.196      yamt 		return;
    207          1.196      yamt 	}
    208          1.192        ad 	for (fd = 0; fd < dt->dt_nfiles; fd++) {
    209          1.192        ad 		if (fd < NDFDFILE) {
    210          1.192        ad 			KASSERT(dt->dt_ff[fd] ==
    211          1.192        ad 			    (fdfile_t *)fdp->fd_dfdfile[fd]);
    212          1.192        ad 		}
    213          1.192        ad 		if (dt->dt_ff[fd] == NULL) {
    214          1.192        ad 			KASSERT(!fd_isused(fdp, fd));
    215          1.192        ad 		} else if (dt->dt_ff[fd]->ff_file != NULL) {
    216          1.192        ad 			KASSERT(fd_isused(fdp, fd));
    217          1.192        ad 		}
    218          1.192        ad 	}
    219          1.213     rmind #endif
    220          1.192        ad }
    221          1.192        ad 
    222          1.173        ad static int
    223          1.173        ad fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
    224          1.115    provos {
    225          1.115    provos 	int i, off, maxoff;
    226          1.115    provos 	uint32_t sub;
    227          1.115    provos 
    228          1.173        ad 	KASSERT(mutex_owned(&fdp->fd_lock));
    229          1.173        ad 
    230          1.192        ad 	fd_checkmaps(fdp);
    231          1.192        ad 
    232          1.115    provos 	if (want > bits)
    233          1.115    provos 		return -1;
    234          1.115    provos 
    235          1.115    provos 	off = want >> NDENTRYSHIFT;
    236          1.115    provos 	i = want & NDENTRYMASK;
    237          1.115    provos 	if (i) {
    238          1.115    provos 		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
    239          1.115    provos 		if (sub != ~0)
    240          1.115    provos 			goto found;
    241          1.115    provos 		off++;
    242          1.115    provos 	}
    243          1.115    provos 
    244          1.115    provos 	maxoff = NDLOSLOTS(bits);
    245          1.115    provos 	while (off < maxoff) {
    246          1.115    provos 		if ((sub = bitmap[off]) != ~0)
    247          1.115    provos 			goto found;
    248          1.115    provos 		off++;
    249          1.115    provos 	}
    250          1.115    provos 
    251          1.213     rmind 	return -1;
    252          1.115    provos 
    253          1.115    provos  found:
    254          1.115    provos 	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
    255          1.115    provos }
    256          1.115    provos 
    257          1.134   thorpej static int
    258          1.173        ad fd_last_set(filedesc_t *fd, int last)
    259          1.115    provos {
    260          1.115    provos 	int off, i;
    261          1.192        ad 	fdfile_t **ff = fd->fd_dt->dt_ff;
    262          1.115    provos 	uint32_t *bitmap = fd->fd_lomap;
    263          1.115    provos 
    264          1.173        ad 	KASSERT(mutex_owned(&fd->fd_lock));
    265          1.173        ad 
    266          1.192        ad 	fd_checkmaps(fd);
    267          1.192        ad 
    268          1.115    provos 	off = (last - 1) >> NDENTRYSHIFT;
    269          1.115    provos 
    270          1.121    provos 	while (off >= 0 && !bitmap[off])
    271          1.115    provos 		off--;
    272          1.115    provos 
    273          1.115    provos 	if (off < 0)
    274          1.213     rmind 		return -1;
    275          1.131     perry 
    276          1.115    provos 	i = ((off + 1) << NDENTRYSHIFT) - 1;
    277          1.115    provos 	if (i >= last)
    278          1.115    provos 		i = last - 1;
    279          1.115    provos 
    280          1.173        ad 	/* XXX should use bitmap */
    281          1.192        ad 	while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
    282          1.115    provos 		i--;
    283          1.115    provos 
    284          1.213     rmind 	return i;
    285          1.115    provos }
    286          1.115    provos 
    287          1.192        ad static inline void
    288          1.173        ad fd_used(filedesc_t *fdp, unsigned fd)
    289           1.27   mycroft {
    290          1.115    provos 	u_int off = fd >> NDENTRYSHIFT;
    291          1.173        ad 	fdfile_t *ff;
    292          1.173        ad 
    293          1.192        ad 	ff = fdp->fd_dt->dt_ff[fd];
    294          1.115    provos 
    295          1.173        ad 	KASSERT(mutex_owned(&fdp->fd_lock));
    296          1.173        ad 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
    297          1.173        ad 	KASSERT(ff != NULL);
    298          1.173        ad 	KASSERT(ff->ff_file == NULL);
    299          1.213     rmind 	KASSERT(!ff->ff_allocated);
    300          1.124      yamt 
    301          1.217       chs 	ff->ff_allocated = true;
    302          1.115    provos 	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
    303          1.192        ad 	if (__predict_false(fdp->fd_lomap[off] == ~0)) {
    304          1.173        ad 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
    305          1.124      yamt 		    (1 << (off & NDENTRYMASK))) == 0);
    306          1.115    provos 		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
    307          1.124      yamt 	}
    308           1.27   mycroft 
    309          1.173        ad 	if ((int)fd > fdp->fd_lastfile) {
    310           1.27   mycroft 		fdp->fd_lastfile = fd;
    311          1.173        ad 	}
    312          1.173        ad 
    313          1.192        ad 	fd_checkmaps(fdp);
    314           1.27   mycroft }
    315           1.27   mycroft 
    316          1.192        ad static inline void
    317          1.173        ad fd_unused(filedesc_t *fdp, unsigned fd)
    318           1.27   mycroft {
    319          1.115    provos 	u_int off = fd >> NDENTRYSHIFT;
    320          1.173        ad 	fdfile_t *ff;
    321           1.27   mycroft 
    322          1.192        ad 	ff = fdp->fd_dt->dt_ff[fd];
    323          1.173        ad 
    324          1.173        ad 	/*
    325          1.173        ad 	 * Don't assert the lock is held here, as we may be copying
    326          1.173        ad 	 * the table during exec() and it is not needed there.
    327          1.173        ad 	 * procfs and sysctl are locked out by proc::p_reflock.
    328          1.173        ad 	 *
    329          1.173        ad 	 * KASSERT(mutex_owned(&fdp->fd_lock));
    330          1.173        ad 	 */
    331          1.173        ad 	KASSERT(ff != NULL);
    332          1.173        ad 	KASSERT(ff->ff_file == NULL);
    333          1.213     rmind 	KASSERT(ff->ff_allocated);
    334          1.173        ad 
    335          1.173        ad 	if (fd < fdp->fd_freefile) {
    336           1.27   mycroft 		fdp->fd_freefile = fd;
    337          1.173        ad 	}
    338          1.115    provos 
    339          1.124      yamt 	if (fdp->fd_lomap[off] == ~0) {
    340          1.173        ad 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
    341          1.124      yamt 		    (1 << (off & NDENTRYMASK))) != 0);
    342          1.124      yamt 		fdp->fd_himap[off >> NDENTRYSHIFT] &=
    343          1.124      yamt 		    ~(1 << (off & NDENTRYMASK));
    344          1.124      yamt 	}
    345          1.173        ad 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
    346          1.115    provos 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
    347          1.217       chs 	ff->ff_allocated = false;
    348          1.115    provos 
    349          1.173        ad 	KASSERT(fd <= fdp->fd_lastfile);
    350          1.173        ad 	if (fd == fdp->fd_lastfile) {
    351          1.173        ad 		fdp->fd_lastfile = fd_last_set(fdp, fd);
    352          1.173        ad 	}
    353          1.192        ad 	fd_checkmaps(fdp);
    354           1.77   thorpej }
    355           1.77   thorpej 
    356           1.16       cgd /*
    357          1.173        ad  * Look up the file structure corresponding to a file descriptor
    358          1.173        ad  * and return the file, holding a reference on the descriptor.
    359          1.134   thorpej  */
    360          1.214     rmind file_t *
    361          1.173        ad fd_getfile(unsigned fd)
    362          1.134   thorpej {
    363          1.173        ad 	filedesc_t *fdp;
    364          1.173        ad 	fdfile_t *ff;
    365          1.173        ad 	file_t *fp;
    366          1.192        ad 	fdtab_t *dt;
    367          1.134   thorpej 
    368          1.134   thorpej 	/*
    369          1.173        ad 	 * Look up the fdfile structure representing this descriptor.
    370          1.192        ad 	 * We are doing this unlocked.  See fd_tryexpand().
    371          1.134   thorpej 	 */
    372          1.192        ad 	fdp = curlwp->l_fd;
    373          1.192        ad 	dt = fdp->fd_dt;
    374          1.192        ad 	if (__predict_false(fd >= dt->dt_nfiles)) {
    375          1.173        ad 		return NULL;
    376          1.173        ad 	}
    377          1.192        ad 	ff = dt->dt_ff[fd];
    378          1.173        ad 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
    379          1.173        ad 	if (__predict_false(ff == NULL)) {
    380          1.173        ad 		return NULL;
    381          1.173        ad 	}
    382          1.134   thorpej 
    383          1.191        ad 	/* Now get a reference to the descriptor. */
    384          1.191        ad 	if (fdp->fd_refcnt == 1) {
    385          1.191        ad 		/*
    386          1.191        ad 		 * Single threaded: don't need to worry about concurrent
    387          1.191        ad 		 * access (other than earlier calls to kqueue, which may
    388          1.191        ad 		 * hold a reference to the descriptor).
    389          1.191        ad 		 */
    390          1.191        ad 		ff->ff_refcnt++;
    391          1.191        ad 	} else {
    392          1.191        ad 		/*
    393          1.192        ad 		 * Multi threaded: issue a memory barrier to ensure that we
    394          1.192        ad 		 * acquire the file pointer _after_ adding a reference.  If
    395          1.192        ad 		 * no memory barrier, we could fetch a stale pointer.
    396          1.191        ad 		 */
    397          1.191        ad 		atomic_inc_uint(&ff->ff_refcnt);
    398          1.173        ad #ifndef __HAVE_ATOMIC_AS_MEMBAR
    399          1.191        ad 		membar_enter();
    400          1.173        ad #endif
    401          1.191        ad 	}
    402          1.134   thorpej 
    403          1.173        ad 	/*
    404          1.173        ad 	 * If the file is not open or is being closed then put the
    405          1.173        ad 	 * reference back.
    406          1.173        ad 	 */
    407          1.173        ad 	fp = ff->ff_file;
    408          1.173        ad 	if (__predict_true(fp != NULL)) {
    409          1.173        ad 		return fp;
    410          1.134   thorpej 	}
    411          1.173        ad 	fd_putfile(fd);
    412          1.173        ad 	return NULL;
    413          1.134   thorpej }
    414          1.134   thorpej 
    415          1.134   thorpej /*
    416          1.173        ad  * Release a reference to a file descriptor acquired with fd_getfile().
    417          1.161        ad  */
    418          1.161        ad void
    419          1.173        ad fd_putfile(unsigned fd)
    420          1.161        ad {
    421          1.173        ad 	filedesc_t *fdp;
    422          1.173        ad 	fdfile_t *ff;
    423          1.173        ad 	u_int u, v;
    424          1.173        ad 
    425          1.173        ad 	fdp = curlwp->l_fd;
    426          1.192        ad 	ff = fdp->fd_dt->dt_ff[fd];
    427          1.173        ad 
    428          1.192        ad 	KASSERT(fd < fdp->fd_dt->dt_nfiles);
    429          1.173        ad 	KASSERT(ff != NULL);
    430          1.173        ad 	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
    431          1.173        ad 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
    432          1.161        ad 
    433          1.191        ad 	if (fdp->fd_refcnt == 1) {
    434          1.191        ad 		/*
    435          1.191        ad 		 * Single threaded: don't need to worry about concurrent
    436          1.191        ad 		 * access (other than earlier calls to kqueue, which may
    437          1.191        ad 		 * hold a reference to the descriptor).
    438          1.191        ad 		 */
    439          1.191        ad 		if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
    440          1.191        ad 			fd_close(fd);
    441          1.191        ad 			return;
    442          1.191        ad 		}
    443          1.191        ad 		ff->ff_refcnt--;
    444          1.191        ad 		return;
    445          1.191        ad 	}
    446          1.191        ad 
    447          1.173        ad 	/*
    448          1.173        ad 	 * Ensure that any use of the file is complete and globally
    449          1.173        ad 	 * visible before dropping the final reference.  If no membar,
    450          1.173        ad 	 * the current CPU could still access memory associated with
    451          1.173        ad 	 * the file after it has been freed or recycled by another
    452          1.173        ad 	 * CPU.
    453          1.173        ad 	 */
    454          1.173        ad #ifndef __HAVE_ATOMIC_AS_MEMBAR
    455          1.173        ad 	membar_exit();
    456          1.173        ad #endif
    457          1.161        ad 
    458          1.173        ad 	/*
    459          1.173        ad 	 * Be optimistic and start out with the assumption that no other
    460          1.173        ad 	 * threads are trying to close the descriptor.  If the CAS fails,
    461          1.173        ad 	 * we lost a race and/or it's being closed.
    462          1.173        ad 	 */
    463          1.173        ad 	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
    464          1.173        ad 		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
    465          1.173        ad 		if (__predict_true(u == v)) {
    466          1.173        ad 			return;
    467          1.173        ad 		}
    468          1.173        ad 		if (__predict_false((v & FR_CLOSING) != 0)) {
    469          1.173        ad 			break;
    470          1.173        ad 		}
    471          1.173        ad 	}
    472          1.162        ad 
    473          1.173        ad 	/* Another thread is waiting to close the file: join it. */
    474          1.173        ad 	(void)fd_close(fd);
    475          1.161        ad }
    476          1.161        ad 
    477          1.161        ad /*
    478          1.173        ad  * Convenience wrapper around fd_getfile() that returns reference
    479          1.173        ad  * to a vnode.
    480           1.16       cgd  */
    481           1.38  christos int
    482          1.173        ad fd_getvnode(unsigned fd, file_t **fpp)
    483           1.36   thorpej {
    484          1.173        ad 	vnode_t *vp;
    485          1.173        ad 	file_t *fp;
    486           1.72     lukem 
    487          1.173        ad 	fp = fd_getfile(fd);
    488          1.173        ad 	if (__predict_false(fp == NULL)) {
    489          1.173        ad 		return EBADF;
    490          1.173        ad 	}
    491          1.173        ad 	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
    492          1.173        ad 		fd_putfile(fd);
    493          1.173        ad 		return EINVAL;
    494          1.173        ad 	}
    495          1.173        ad 	vp = fp->f_data;
    496          1.173        ad 	if (__predict_false(vp->v_type == VBAD)) {
    497          1.173        ad 		/* XXX Is this case really necessary? */
    498          1.173        ad 		fd_putfile(fd);
    499          1.173        ad 		return EBADF;
    500           1.59   thorpej 	}
    501          1.173        ad 	*fpp = fp;
    502          1.173        ad 	return 0;
    503           1.16       cgd }
    504           1.16       cgd 
    505           1.16       cgd /*
    506          1.173        ad  * Convenience wrapper around fd_getfile() that returns reference
    507          1.173        ad  * to a socket.
    508           1.16       cgd  */
    509           1.38  christos int
    510          1.218  christos fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
    511           1.36   thorpej {
    512          1.218  christos 	*fp = fd_getfile(fd);
    513          1.218  christos 	if (__predict_false(*fp == NULL)) {
    514          1.173        ad 		return EBADF;
    515          1.103        pk 	}
    516          1.218  christos 	if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) {
    517          1.173        ad 		fd_putfile(fd);
    518          1.173        ad 		return ENOTSOCK;
    519           1.17       cgd 	}
    520          1.218  christos 	*sop = (*fp)->f_data;
    521          1.173        ad 	return 0;
    522           1.16       cgd }
    523           1.16       cgd 
    524          1.218  christos int
    525          1.218  christos fd_getsock(unsigned fd, struct socket **sop)
    526          1.218  christos {
    527          1.218  christos 	file_t *fp;
    528          1.218  christos 	return fd_getsock1(fd, sop, &fp);
    529          1.218  christos }
    530          1.218  christos 
    531           1.16       cgd /*
    532          1.173        ad  * Look up the file structure corresponding to a file descriptor
    533          1.173        ad  * and return it with a reference held on the file, not the
    534          1.173        ad  * descriptor.
    535          1.173        ad  *
    536          1.173        ad  * This is heavyweight and only used when accessing descriptors
    537          1.173        ad  * from a foreign process.  The caller must ensure that `p' does
    538          1.173        ad  * not exit or fork across this call.
    539          1.173        ad  *
    540          1.173        ad  * To release the file (not descriptor) reference, use closef().
                                  *
                                  * Returns NULL if `fd' is not below the table's dt_nfiles, if the
                                  * slot has no fdfile_t, or if the fdfile_t has no file attached.
                                  * The whole lookup runs with p->p_fd's fd_lock held.
    541          1.134   thorpej  */
    542          1.173        ad file_t *
    543          1.173        ad fd_getfile2(proc_t *p, unsigned fd)
    544          1.134   thorpej {
    545          1.173        ad 	filedesc_t *fdp;
    546          1.173        ad 	fdfile_t *ff;
    547          1.173        ad 	file_t *fp;
    548          1.192        ad 	fdtab_t *dt;
    549          1.134   thorpej 
    550          1.173        ad 	fdp = p->p_fd;
    551          1.173        ad 	mutex_enter(&fdp->fd_lock);
    552          1.192        ad 	dt = fdp->fd_dt;
    553          1.192        ad 	if (fd >= dt->dt_nfiles) {
    554          1.173        ad 		mutex_exit(&fdp->fd_lock);
    555          1.173        ad 		return NULL;
    556          1.173        ad 	}
    557          1.192        ad 	if ((ff = dt->dt_ff[fd]) == NULL) {
    558          1.173        ad 		mutex_exit(&fdp->fd_lock);
    559          1.173        ad 		return NULL;
    560          1.173        ad 	}
    561          1.173        ad 	if ((fp = ff->ff_file) == NULL) {
    562          1.173        ad 		mutex_exit(&fdp->fd_lock);
    563          1.173        ad 		return NULL;
    564          1.158       dsl 	}
                                 	/* Take the file reference under f_lock, nested inside fd_lock. */
    565          1.173        ad 	mutex_enter(&fp->f_lock);
    566          1.173        ad 	fp->f_count++;
    567          1.173        ad 	mutex_exit(&fp->f_lock);
    568          1.173        ad 	mutex_exit(&fdp->fd_lock);
    569          1.158       dsl 
    570          1.173        ad 	return fp;
    571          1.158       dsl }
    572          1.158       dsl 
    573          1.134   thorpej /*
    574          1.173        ad  * Internal form of close.  Must be called with a reference to the
    575          1.173        ad  * descriptor, and will drop the reference.  When all descriptor
    576          1.173        ad  * references are dropped, releases the descriptor slot and a single
    577          1.173        ad  * reference to the file structure.
                                  *
                                  * Operates on the current LWP's descriptor table (curlwp->l_fd).
                                  * Returns EBADF if the slot's file pointer is already NULL (another
                                  * close is in progress), otherwise the value returned by closef().
                                  * fd_lock is dropped around knote_fdclose(), fo_restart() and
                                  * VOP_ADVLOCK() and re-taken afterwards.
    578          1.173        ad  */
    579          1.173        ad int
    580          1.173        ad fd_close(unsigned fd)
    581          1.173        ad {
    582          1.173        ad 	struct flock lf;
    583          1.173        ad 	filedesc_t *fdp;
    584          1.173        ad 	fdfile_t *ff;
    585          1.173        ad 	file_t *fp;
    586          1.173        ad 	proc_t *p;
    587          1.173        ad 	lwp_t *l;
    588          1.192        ad 	u_int refcnt;
    589           1.72     lukem 
    590          1.173        ad 	l = curlwp;
    591           1.99   thorpej 	p = l->l_proc;
    592          1.173        ad 	fdp = l->l_fd;
                                 	/*
                                 	 * NOTE(review): dt_ff[fd] is read before fd_lock is taken and
                                 	 * without a bounds check; presumably safe because the caller's
                                 	 * descriptor reference keeps the slot valid -- confirm.
                                 	 */
    593          1.192        ad 	ff = fdp->fd_dt->dt_ff[fd];
    594           1.16       cgd 
    595          1.173        ad 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
    596          1.122  christos 
    597          1.192        ad 	mutex_enter(&fdp->fd_lock);
    598          1.173        ad 	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
    599          1.192        ad 	if (__predict_false(ff->ff_file == NULL)) {
    600          1.173        ad 		/*
    601          1.173        ad 		 * Another user of the file is already closing, and is
    602          1.173        ad 		 * waiting for other users of the file to drain.  Release
    603          1.173        ad 		 * our reference, and wake up the closer.
    604          1.173        ad 		 */
    605          1.173        ad 		atomic_dec_uint(&ff->ff_refcnt);
    606          1.173        ad 		cv_broadcast(&ff->ff_closing);
    607          1.192        ad 		mutex_exit(&fdp->fd_lock);
    608          1.122  christos 
    609          1.173        ad 		/*
    610          1.173        ad 		 * An application error, so pretend that the descriptor
    611          1.173        ad 		 * was already closed.  We can't safely wait for it to
    612          1.173        ad 		 * be closed without potentially deadlocking.
    613          1.173        ad 		 */
    614           1.16       cgd 		return (EBADF);
    615           1.61  wrstuden 	}
    616          1.173        ad 	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
    617           1.61  wrstuden 
    618          1.173        ad 	/*
    619          1.173        ad 	 * There may be multiple users of this file within the process.
    620          1.173        ad 	 * Notify existing and new users that the file is closing.  This
    621          1.173        ad 	 * will prevent them from adding additional uses to this file
    622          1.173        ad 	 * while we are closing it.
    623          1.173        ad 	 */
    624          1.173        ad 	fp = ff->ff_file;
    625          1.173        ad 	ff->ff_file = NULL;
    626          1.182      matt 	ff->ff_exclose = false;
    627           1.17       cgd 
    628          1.173        ad 	/*
    629          1.173        ad 	 * We expect the caller to hold a descriptor reference - drop it.
    630          1.173        ad 	 * The reference count may increase beyond zero at this point due
    631          1.173        ad 	 * to an erroneous descriptor reference by an application, but
    632          1.173        ad 	 * fd_getfile() will notice that the file is being closed and drop
    633          1.173        ad 	 * the reference again.
    634          1.173        ad 	 */
    635          1.192        ad 	if (fdp->fd_refcnt == 1) {
    636          1.192        ad 		/* Single threaded. */
    637          1.192        ad 		refcnt = --(ff->ff_refcnt);
    638          1.192        ad 	} else {
    639          1.192        ad 		/* Multi threaded. */
    640          1.173        ad #ifndef __HAVE_ATOMIC_AS_MEMBAR
    641          1.192        ad 		membar_producer();
    642          1.173        ad #endif
    643          1.192        ad 		refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
    644          1.192        ad 	}
    645          1.192        ad 	if (__predict_false(refcnt != 0)) {
    646          1.173        ad 		/*
    647          1.173        ad 		 * Wait for other references to drain.  This is typically
    648          1.173        ad 		 * an application error - the descriptor is being closed
    649          1.173        ad 		 * while still in use.
    650          1.202       dsl 		 * (Or just a threaded application trying to unblock its
    651          1.202       dsl 		 * thread that sleeps in (say) accept()).
    652          1.173        ad 		 */
    653          1.173        ad 		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
    654          1.190        ad 
    655          1.173        ad 		/*
    656          1.173        ad 		 * Remove any knotes attached to the file.  A knote
    657          1.173        ad 		 * attached to the descriptor can hold references on it.
    658          1.173        ad 		 */
    659          1.192        ad 		mutex_exit(&fdp->fd_lock);
    660          1.173        ad 		if (!SLIST_EMPTY(&ff->ff_knlist)) {
    661          1.173        ad 			knote_fdclose(fd);
    662           1.75   thorpej 		}
    663          1.190        ad 
    664          1.202       dsl 		/*
    665          1.202       dsl 		 * Since the file system code doesn't know which fd
    666          1.202       dsl 		 * each request came from (think dup()), we have to
    667          1.202       dsl 		 * ask it to return ERESTART for any long-term blocks.
    668          1.202       dsl 		 * The re-entry through read/write/etc will detect the
    669          1.202       dsl 		 * closed fd and return EBADF.
    670          1.202       dsl 		 * Blocked partial writes may return a short length.
    671          1.202       dsl 		 */
    672          1.202       dsl 		(*fp->f_ops->fo_restart)(fp);
    673          1.192        ad 		mutex_enter(&fdp->fd_lock);
    674          1.190        ad 
    675          1.173        ad 		/*
    676          1.173        ad 		 * We need to see the count drop to zero at least once,
    677          1.173        ad 		 * in order to ensure that all pre-existing references
    678          1.173        ad 		 * have been drained.  New references past this point are
    679          1.173        ad 		 * of no interest.
    680          1.202       dsl 		 * XXX (dsl) this may need to call fo_restart() after a
    681          1.202       dsl 		 * timeout to guarantee that all the system calls exit.
    682          1.173        ad 		 */
    683          1.173        ad 		while ((ff->ff_refcnt & FR_MASK) != 0) {
    684          1.192        ad 			cv_wait(&ff->ff_closing, &fdp->fd_lock);
    685          1.107       dsl 		}
    686          1.173        ad 		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
    687          1.173        ad 	} else {
    688          1.173        ad 		/* If no references, there must be no knotes. */
    689          1.173        ad 		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
    690           1.16       cgd 	}
    691           1.59   thorpej 
    692          1.173        ad 	/*
    693          1.173        ad 	 * POSIX record locking dictates that any close releases ALL
    694          1.173        ad 	 * locks owned by this process.  This is handled by setting
    695          1.173        ad 	 * a flag in the unlock to free ONLY locks obeying POSIX
    696          1.173        ad 	 * semantics, and not to free BSD-style file locks.
    697          1.173        ad 	 * If the descriptor was in a message, POSIX-style locks
    698          1.173        ad 	 * aren't passed with the descriptor.
    699          1.173        ad 	 */
    700          1.192        ad 	if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 &&
    701          1.192        ad 	    fp->f_type == DTYPE_VNODE)) {
    702          1.173        ad 		lf.l_whence = SEEK_SET;
    703          1.173        ad 		lf.l_start = 0;
    704          1.173        ad 		lf.l_len = 0;
    705          1.173        ad 		lf.l_type = F_UNLCK;
    706          1.192        ad 		mutex_exit(&fdp->fd_lock);
    707          1.173        ad 		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
    708          1.192        ad 		mutex_enter(&fdp->fd_lock);
    709          1.103        pk 	}
    710          1.103        pk 
    711          1.173        ad 	/* Free descriptor slot. */
    712          1.126        pk 	fd_unused(fdp, fd);
    713          1.173        ad 	mutex_exit(&fdp->fd_lock);
    714          1.126        pk 
    715          1.173        ad 	/* Now drop reference to the file itself. */
    716          1.173        ad 	return closef(fp);
    717           1.27   mycroft }
    718           1.27   mycroft 
    719           1.17       cgd /*
    720          1.173        ad  * Duplicate a file descriptor.
                                  *
                                  * Allocates a descriptor numbered at least `minfd' in the current
                                  * process, calling fd_tryexpand() and retrying whenever fd_alloc()
                                  * reports ENOSPC.  The new slot's ff_exclose is set to `exclose'
                                  * and `fp' is attached to it with fd_affix().
                                  *
                                  * Returns 0 with the new descriptor number in *newp, or the first
                                  * fd_alloc() error other than ENOSPC.
    721           1.16       cgd  */
    722           1.38  christos int
    723          1.182      matt fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
    724           1.36   thorpej {
    725          1.213     rmind 	proc_t *p = curproc;
    726          1.173        ad 	int error;
    727           1.16       cgd 
    728          1.173        ad 	while ((error = fd_alloc(p, minfd, newp)) != 0) {
    729          1.173        ad 		if (error != ENOSPC) {
    730          1.173        ad 			return error;
    731          1.173        ad 		}
                                 		/* ENOSPC: table is full but may still grow; expand and retry. */
    732          1.173        ad 		fd_tryexpand(p);
    733          1.173        ad 	}
    734           1.79   thorpej 
                                 	/* Set close-on-exec before fd_affix() attaches the file to the slot. */
    735          1.192        ad 	curlwp->l_fd->fd_dt->dt_ff[*newp]->ff_exclose = exclose;
    736          1.173        ad 	fd_affix(p, fp, *newp);
    737          1.173        ad 	return 0;
    738           1.16       cgd }
    739           1.16       cgd 
    740           1.17       cgd /*
    741          1.173        ad  * dup2 operation.
                                  *
                                  * Makes descriptor `new' of the current process refer to `fp',
                                  * closing whatever was open there first.  `flags' may contain only
                                  * O_CLOEXEC and O_NONBLOCK; any other bit yields EINVAL.  Returns
                                  * 0 otherwise.
                                  *
                                  * An fdfile_t is taken from fdfile_cache before fd_lock is
                                  * acquired and handed back at the end if the slot already had one.
    742          1.153       dsl  */
    743          1.153       dsl int
    744          1.215  christos fd_dup2(file_t *fp, unsigned new, int flags)
    745          1.153       dsl {
    746          1.213     rmind 	filedesc_t *fdp = curlwp->l_fd;
    747          1.173        ad 	fdfile_t *ff;
    748          1.192        ad 	fdtab_t *dt;
    749          1.153       dsl 
    750          1.216  christos 	if (flags & ~(O_CLOEXEC|O_NONBLOCK))
    751          1.216  christos 		return EINVAL;
    752          1.173        ad 	/*
    753          1.173        ad 	 * Ensure there are enough slots in the descriptor table,
    754          1.173        ad 	 * and allocate an fdfile_t up front in case we need it.
    755          1.173        ad 	 */
    756          1.192        ad 	while (new >= fdp->fd_dt->dt_nfiles) {
    757          1.173        ad 		fd_tryexpand(curproc);
    758          1.173        ad 	}
    759          1.173        ad 	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
    760          1.153       dsl 
    761          1.173        ad 	/*
    762          1.173        ad 	 * If there is already a file open, close it.  If the file is
    763          1.173        ad 	 * half open, wait for it to be constructed before closing it.
    764          1.173        ad 	 * XXX Potential for deadlock here?
    765          1.173        ad 	 */
    766          1.173        ad 	mutex_enter(&fdp->fd_lock);
    767          1.173        ad 	while (fd_isused(fdp, new)) {
    768          1.173        ad 		mutex_exit(&fdp->fd_lock);
    769          1.173        ad 		if (fd_getfile(new) != NULL) {
    770          1.173        ad 			(void)fd_close(new);
    771          1.173        ad 		} else {
    772          1.192        ad 			/*
    773          1.192        ad 			 * Crummy, but unlikely to happen.
    774          1.192        ad 			 * Can occur if we interrupt another
    775          1.192        ad 			 * thread while it is opening a file.
    776          1.192        ad 			 */
    777          1.173        ad 			kpause("dup2", false, 1, NULL);
    778          1.173        ad 		}
    779          1.173        ad 		mutex_enter(&fdp->fd_lock);
    780          1.173        ad 	}
                                 	/* Re-read fd_dt under the lock; install the spare fdfile_t if the slot is empty. */
    781          1.192        ad 	dt = fdp->fd_dt;
    782          1.192        ad 	if (dt->dt_ff[new] == NULL) {
    783          1.173        ad 		KASSERT(new >= NDFDFILE);
    784          1.192        ad 		dt->dt_ff[new] = ff;
    785          1.173        ad 		ff = NULL;
    786          1.213     rmind 	}
    787          1.173        ad 	fd_used(fdp, new);
    788          1.173        ad 	mutex_exit(&fdp->fd_lock);
    789          1.173        ad 
                                 	/*
                                 	 * NOTE(review): the validation above names O_NONBLOCK but the
                                 	 * mask below is FNONBLOCK; presumably the same bit -- confirm.
                                 	 * The flag is ORed into fp->f_flag, i.e. into the file_t itself
                                 	 * rather than into per-descriptor state.
                                 	 */
    790          1.215  christos 	dt->dt_ff[new]->ff_exclose = (flags & O_CLOEXEC) != 0;
    791          1.215  christos 	fp->f_flag |= flags & FNONBLOCK;
    792          1.173        ad 	/* Slot is now allocated.  Insert copy of the file. */
    793          1.173        ad 	fd_affix(curproc, fp, new);
    794          1.173        ad 	if (ff != NULL) {
    795          1.173        ad 		pool_cache_put(fdfile_cache, ff);
    796          1.173        ad 	}
    797          1.173        ad 	return 0;
    798          1.153       dsl }
    799          1.153       dsl 
    800          1.153       dsl /*
    801          1.173        ad  * Drop reference to a file structure.
                                  *
                                  * Decrements fp->f_count under f_lock.  If references remain,
                                  * returns 0 and does nothing else.  On the last reference: releases
                                  * an flock-style lock (FHASLOCK on a DTYPE_VNODE file) via
                                  * VOP_ADVLOCK(), calls the file's fo_close if f_ops is set, and
                                  * returns the file_t to file_cache.
                                  *
                                  * Returns the fo_close result, or 0 when f_ops is NULL.  `fp' must
                                  * not be used by the caller afterwards.
    802           1.17       cgd  */
    803           1.38  christos int
    804          1.173        ad closef(file_t *fp)
    805           1.36   thorpej {
    806          1.173        ad 	struct flock lf;
    807          1.173        ad 	int error;
    808           1.16       cgd 
    809          1.173        ad 	/*
    810          1.173        ad 	 * Drop reference.  If referenced elsewhere it's still open
    811          1.173        ad 	 * and we have nothing more to do.
    812          1.173        ad 	 */
    813          1.173        ad 	mutex_enter(&fp->f_lock);
    814          1.173        ad 	KASSERT(fp->f_count > 0);
    815          1.173        ad 	if (--fp->f_count > 0) {
    816          1.173        ad 		mutex_exit(&fp->f_lock);
    817          1.173        ad 		return 0;
    818          1.173        ad 	}
    819          1.173        ad 	KASSERT(fp->f_count == 0);
    820          1.173        ad 	mutex_exit(&fp->f_lock);
    821           1.59   thorpej 
    822          1.173        ad 	/* We held the last reference - release locks, close and free. */
    823          1.213     rmind 	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
                                 		/* Unlock the whole file: start 0, length 0, from SEEK_SET. */
    824          1.213     rmind 		lf.l_whence = SEEK_SET;
    825          1.173        ad 		lf.l_start = 0;
    826          1.173        ad 		lf.l_len = 0;
    827          1.173        ad 		lf.l_type = F_UNLCK;
    828          1.173        ad 		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
    829          1.173        ad 	}
    830          1.173        ad 	if (fp->f_ops != NULL) {
    831          1.173        ad 		error = (*fp->f_ops->fo_close)(fp);
    832          1.173        ad 	} else {
    833          1.173        ad 		error = 0;
    834           1.17       cgd 	}
                                 	/* The file_t goes back to the cache with its f_cred still set. */
    835          1.191        ad 	KASSERT(fp->f_count == 0);
    836          1.191        ad 	KASSERT(fp->f_cred != NULL);
    837          1.191        ad 	pool_cache_put(file_cache, fp);
    838           1.59   thorpej 
    839          1.173        ad 	return error;
    840           1.16       cgd }
    841           1.16       cgd 
    842           1.16       cgd /*
    843           1.16       cgd  * Allocate a file descriptor for the process.
                                  *
                                  * `p' must be curproc or proc0.  Searches the two-level bitmap
                                  * (fd_himap / fd_lomap) for a free descriptor >= `want', bounded by
                                  * the smaller of the table size and the limit derived from
                                  * RLIMIT_NOFILE and maxfiles.
                                  *
                                  * Returns 0 with the descriptor marked used and stored in *result.
                                  * Otherwise returns EMFILE when the table has already reached the
                                  * limit, or ENOSPC when the table itself is full but may still be
                                  * grown -- callers in this file respond to ENOSPC by calling
                                  * fd_tryexpand() and retrying.
    844           1.16       cgd  */
    845           1.38  christos int
    846          1.173        ad fd_alloc(proc_t *p, int want, int *result)
    847           1.72     lukem {
    848          1.213     rmind 	filedesc_t *fdp = p->p_fd;
    849          1.126        pk 	int i, lim, last, error;
    850          1.115    provos 	u_int off, new;
    851          1.192        ad 	fdtab_t *dt;
    852          1.173        ad 
    853          1.173        ad 	KASSERT(p == curproc || p == &proc0);
    854           1.72     lukem 
    855           1.16       cgd 	/*
    856           1.16       cgd 	 * Search for a free descriptor starting at the higher
    857          1.173        ad 	 * of want or fd_freefile.
    858           1.16       cgd 	 */
    859          1.173        ad 	mutex_enter(&fdp->fd_lock);
    860          1.192        ad 	fd_checkmaps(fdp);
    861          1.192        ad 	dt = fdp->fd_dt;
    862          1.192        ad 	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
    863           1.17       cgd 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
    864          1.192        ad 	last = min(dt->dt_nfiles, lim);
    865          1.173        ad 	for (;;) {
    866          1.173        ad 		if ((i = want) < fdp->fd_freefile)
    867          1.173        ad 			i = fdp->fd_freefile;
                                 		/* `off' is the index of the bitmap word that holds descriptor i. */
    868          1.173        ad 		off = i >> NDENTRYSHIFT;
    869          1.173        ad 		new = fd_next_zero(fdp, fdp->fd_himap, off,
    870          1.173        ad 		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
                                 		/* `new' is u_int, so this compares against (u_int)-1. */
    871          1.173        ad 		if (new == -1)
    872          1.173        ad 			break;
    873          1.173        ad 		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
    874          1.115    provos 		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
    875          1.115    provos 		if (i == -1) {
    876          1.131     perry 			/*
    877          1.173        ad 			 * Free file descriptor in this block was
    878          1.115    provos 			 * below want, try again with higher want.
    879          1.115    provos 			 */
    880          1.115    provos 			want = (new + 1) << NDENTRYSHIFT;
    881          1.173        ad 			continue;
    882          1.115    provos 		}
    883          1.115    provos 		i += (new << NDENTRYSHIFT);
    884          1.173        ad 		if (i >= last) {
    885          1.173        ad 			break;
    886          1.173        ad 		}
    887          1.192        ad 		if (dt->dt_ff[i] == NULL) {
    888          1.173        ad 			KASSERT(i >= NDFDFILE);
                                 			/* NOTE(review): PR_WAITOK allocation made while fd_lock is held. */
    889          1.192        ad 			dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK);
    890          1.173        ad 		}
    891          1.192        ad 		KASSERT(dt->dt_ff[i]->ff_file == NULL);
    892          1.173        ad 		fd_used(fdp, i);
    893          1.173        ad 		if (want <= fdp->fd_freefile) {
    894          1.173        ad 			fdp->fd_freefile = i;
    895           1.16       cgd 		}
    896          1.173        ad 		*result = i;
    897          1.192        ad 		KASSERT(i >= NDFDFILE ||
    898          1.192        ad 		    dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
    899          1.192        ad 		fd_checkmaps(fdp);
    900          1.173        ad 		mutex_exit(&fdp->fd_lock);
    901          1.173        ad 		return 0;
    902           1.90     enami 	}
    903           1.16       cgd 
    904          1.173        ad 	/* No space in current array.  Let the caller expand and retry. */
    905          1.192        ad 	error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
    906          1.173        ad 	mutex_exit(&fdp->fd_lock);
    907          1.173        ad 	return error;
    908           1.16       cgd }
    909           1.16       cgd 
    910          1.173        ad /*
    911          1.192        ad  * Allocate memory for a descriptor table.
                                  *
                                  * `n' is the number of slots and must exceed NDFILE.  The size is
                                  * sizeof(fdtab_t) plus (n - NDFILE) additional dt_ff[] entries.
                                  * Only dt_nfiles and dt_link are initialized; the dt_ff[] contents
                                  * are left for the caller to fill (under DIAGNOSTIC the whole
                                  * allocation is first filled with 0xff bytes).  The allocation
                                  * uses KM_SLEEP.
    912          1.185        ad  */
    913          1.192        ad static fdtab_t *
    914          1.192        ad fd_dtab_alloc(int n)
    915          1.185        ad {
    916          1.192        ad 	fdtab_t *dt;
    917          1.192        ad 	size_t sz;
    918          1.185        ad 
    919          1.185        ad 	KASSERT(n > NDFILE);
    920          1.185        ad 
    921          1.192        ad 	sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
    922          1.192        ad 	dt = kmem_alloc(sz, KM_SLEEP);
    923          1.192        ad #ifdef DIAGNOSTIC
    924          1.192        ad 	memset(dt, 0xff, sz);
    925          1.192        ad #endif
    926          1.192        ad 	dt->dt_nfiles = n;
    927          1.192        ad 	dt->dt_link = NULL;
    928          1.192        ad 	return dt;
    929          1.185        ad }
    930          1.185        ad 
    931          1.185        ad /*
    932          1.192        ad  * Free a descriptor table, and all tables linked for deferred free.
                                  *
                                  * Walks the dt_link chain starting at `dt'.  Each table's size is
                                  * recomputed from its own dt_nfiles, using the same formula as
                                  * fd_dtab_alloc().  `dt' must not be NULL: the loop body runs once
                                  * before the NULL test.
    933          1.185        ad  */
    934          1.185        ad static void
    935          1.192        ad fd_dtab_free(fdtab_t *dt)
    936          1.185        ad {
    937          1.192        ad 	fdtab_t *next;
    938          1.192        ad 	size_t sz;
    939          1.185        ad 
    940          1.192        ad 	do {
                                 		/* Save the link first; the table is poisoned and freed below. */
    941          1.192        ad 		next = dt->dt_link;
    942          1.192        ad 		KASSERT(dt->dt_nfiles > NDFILE);
    943          1.192        ad 		sz = sizeof(*dt) +
    944          1.192        ad 		    (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
    945          1.192        ad #ifdef DIAGNOSTIC
    946          1.192        ad 		memset(dt, 0xff, sz);
    947          1.192        ad #endif
    948          1.192        ad 		kmem_free(dt, sz);
    949          1.192        ad 		dt = next;
    950          1.192        ad 	} while (dt != NULL);
    951          1.185        ad }
    952          1.185        ad 
    953          1.185        ad /*
    954          1.185        ad  * Allocate descriptor bitmap.
                                  *
                                  * `n' is the number of descriptors and must exceed NDENTRIES.  One
                                  * allocation holds NDLOSLOTS(n) 32-bit words for the low map
                                  * followed by NDHISLOTS(n) words for the high map: *lo is set to
                                  * the start and *hi to the first byte after the low map.  The
                                  * memory is not zeroed here.
    955          1.185        ad  */
    956          1.185        ad static void
    957          1.185        ad fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
    958          1.185        ad {
    959          1.185        ad 	uint8_t *ptr;
    960          1.185        ad 	size_t szlo, szhi;
    961          1.185        ad 
    962          1.185        ad 	KASSERT(n > NDENTRIES);
    963          1.185        ad 
    964          1.185        ad 	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
    965          1.185        ad 	szhi = NDHISLOTS(n) * sizeof(uint32_t);
    966          1.185        ad 	ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
    967          1.185        ad 	*lo = (uint32_t *)ptr;
    968          1.185        ad 	*hi = (uint32_t *)(ptr + szlo);
    969          1.185        ad }
    970          1.185        ad 
    971          1.185        ad /*
    972          1.185        ad  * Free descriptor bitmap.
                                  *
                                  * The size passed to kmem_free() is recomputed from `n', so `n'
                                  * has to be the descriptor count the maps were sized for.  Only
                                  * `lo' is freed; `hi' is asserted to point directly after the low
                                  * map, i.e. into the same allocation.
    973          1.185        ad  */
    974          1.185        ad static void
    975          1.185        ad fd_map_free(int n, uint32_t *lo, uint32_t *hi)
    976          1.185        ad {
    977          1.185        ad 	size_t szlo, szhi;
    978          1.185        ad 
    979          1.185        ad 	KASSERT(n > NDENTRIES);
    980          1.185        ad 
    981          1.185        ad 	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
    982          1.185        ad 	szhi = NDHISLOTS(n) * sizeof(uint32_t);
    983          1.185        ad 	KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
    984          1.185        ad 	kmem_free(lo, szlo + szhi);
    985          1.185        ad }
    986          1.185        ad 
    987          1.185        ad /*
    988          1.173        ad  * Expand a process' descriptor table.
                                  *
                                  * `p' must be curproc or proc0.  The new size is NDEXTENT when the
                                  * current table is smaller than that, otherwise twice the current
                                  * size.  The new table (and new bitmaps, if more high-map words
                                  * are needed) are allocated before fd_lock is taken.
                                  *
                                  * Returns nothing.  If the table size changed between the unlocked
                                  * read and taking the lock, the fresh allocations are freed and
                                  * the table is left as it is, so callers must re-check and retry.
    989          1.173        ad  */
    990           1.76   thorpej void
    991          1.173        ad fd_tryexpand(proc_t *p)
    992           1.76   thorpej {
    993          1.173        ad 	filedesc_t *fdp;
    994          1.173        ad 	int i, numfiles, oldnfiles;
    995          1.192        ad 	fdtab_t *newdt, *dt;
    996          1.173        ad 	uint32_t *newhimap, *newlomap;
    997          1.173        ad 
    998          1.173        ad 	KASSERT(p == curproc || p == &proc0);
    999           1.76   thorpej 
   1000           1.76   thorpej 	fdp = p->p_fd;
   1001          1.173        ad 	newhimap = NULL;
   1002          1.173        ad 	newlomap = NULL;
                                 	/* Unlocked snapshot of the size; re-validated under fd_lock below. */
   1003          1.192        ad 	oldnfiles = fdp->fd_dt->dt_nfiles;
   1004          1.126        pk 
   1005          1.126        pk 	if (oldnfiles < NDEXTENT)
   1006          1.133  christos 		numfiles = NDEXTENT;
   1007           1.76   thorpej 	else
   1008          1.133  christos 		numfiles = 2 * oldnfiles;
   1009          1.126        pk 
   1010          1.192        ad 	newdt = fd_dtab_alloc(numfiles);
   1011          1.133  christos 	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
   1012          1.185        ad 		fd_map_alloc(numfiles, &newlomap, &newhimap);
   1013          1.126        pk 	}
   1014          1.126        pk 
   1015          1.173        ad 	mutex_enter(&fdp->fd_lock);
   1016          1.192        ad 	dt = fdp->fd_dt;
   1017          1.192        ad 	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
   1018          1.192        ad 	if (dt->dt_nfiles != oldnfiles) {
   1019          1.173        ad 		/* fdp changed; caller must retry */
   1020          1.173        ad 		mutex_exit(&fdp->fd_lock);
   1021          1.192        ad 		fd_dtab_free(newdt);
   1022          1.185        ad 		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
   1023          1.185        ad 			fd_map_free(numfiles, newlomap, newhimap);
   1024          1.185        ad 		}
   1025          1.173        ad 		return;
   1026          1.173        ad 	}
   1027          1.173        ad 
   1028          1.192        ad 	/* Copy the existing descriptor table and zero the new portion. */
   1029          1.192        ad 	i = sizeof(fdfile_t *) * oldnfiles;
   1030          1.192        ad 	memcpy(newdt->dt_ff, dt->dt_ff, i);
   1031          1.194      yamt 	memset((uint8_t *)newdt->dt_ff + i, 0,
   1032          1.194      yamt 	    numfiles * sizeof(fdfile_t *) - i);
   1033          1.173        ad 
   1034          1.173        ad 	/*
   1035          1.192        ad 	 * Link old descriptor array into list to be discarded.  We defer
   1036          1.192        ad 	 * freeing until the last reference to the descriptor table goes
   1037          1.192        ad 	 * away (usually process exit).  This allows us to do lockless
   1038          1.192        ad 	 * lookups in fd_getfile().
   1039          1.173        ad 	 */
                                 	/* A table of NDFILE slots or fewer is neither linked nor freed here. */
   1040          1.173        ad 	if (oldnfiles > NDFILE) {
   1041          1.191        ad 		if (fdp->fd_refcnt > 1) {
   1042          1.192        ad 			newdt->dt_link = dt;
   1043          1.173        ad 		} else {
   1044          1.192        ad 			fd_dtab_free(dt);
   1045          1.173        ad 		}
   1046          1.173        ad 	}
   1047          1.115    provos 
   1048          1.133  christos 	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
                                 		/* Copy each old map into its replacement and zero the tail. */
   1049          1.173        ad 		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
   1050          1.173        ad 		memcpy(newhimap, fdp->fd_himap, i);
   1051          1.173        ad 		memset((uint8_t *)newhimap + i, 0,
   1052          1.133  christos 		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
   1053          1.115    provos 
   1054          1.173        ad 		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
   1055          1.173        ad 		memcpy(newlomap, fdp->fd_lomap, i);
   1056          1.173        ad 		memset((uint8_t *)newlomap + i, 0,
   1057          1.133  christos 		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
   1058          1.115    provos 
                                 		/* Old maps are freed only if they needed more high-map words than NDFILE does. */
   1059          1.126        pk 		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
   1060          1.185        ad 			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
   1061          1.115    provos 		}
   1062          1.115    provos 		fdp->fd_himap = newhimap;
   1063          1.115    provos 		fdp->fd_lomap = newlomap;
   1064          1.115    provos 	}
   1065          1.115    provos 
   1066          1.173        ad 	/*
   1067          1.173        ad 	 * All other modifications must become globally visible before
   1068          1.192        ad 	 * the change to fd_dt.  See fd_getfile().
   1069          1.173        ad 	 */
   1070          1.173        ad 	membar_producer();
   1071          1.192        ad 	fdp->fd_dt = newdt;
   1072          1.192        ad 	KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
   1073          1.192        ad 	fd_checkmaps(fdp);
   1074          1.173        ad 	mutex_exit(&fdp->fd_lock);
   1075           1.76   thorpej }
   1076           1.76   thorpej 
   1077           1.16       cgd /*
   1078          1.173        ad  * Create a new open file structure and allocate a file descriptor
   1079          1.173        ad  * for the current process.
                                  *
                                  * Allocates a descriptor with fd_alloc(), expanding the table and
                                  * retrying on ENOSPC, then takes a file_t from file_cache.  The
                                  * file's credentials are swapped for curlwp's l_cred if they
                                  * differ, FSCAN is cleared, and f_advice and f_offset are reset
                                  * to 0.
                                  *
                                  * Returns 0 with the file in *resultfp and the descriptor in
                                  * *resultfd, the first fd_alloc() error other than ENOSPC, or
                                  * ENFILE if no file_t could be obtained.  The file is not attached
                                  * to the descriptor here; presumably the caller finishes with
                                  * fd_affix() or backs out with fd_abort() -- TODO confirm.
   1080           1.16       cgd  */
   1081           1.38  christos int
   1082          1.173        ad fd_allocfile(file_t **resultfp, int *resultfd)
   1083           1.16       cgd {
   1084          1.213     rmind 	proc_t *p = curproc;
   1085          1.191        ad 	kauth_cred_t cred;
   1086          1.173        ad 	file_t *fp;
   1087          1.173        ad 	int error;
   1088           1.16       cgd 
   1089          1.173        ad 	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
   1090          1.173        ad 		if (error != ENOSPC) {
   1091          1.173        ad 			return error;
   1092           1.76   thorpej 		}
   1093          1.173        ad 		fd_tryexpand(p);
   1094           1.75   thorpej 	}
   1095          1.102        pk 
                                 	/*
                                 	 * NOTE(review): the allocation uses PR_WAITOK; whether it can
                                 	 * return NULL (making the check below reachable) is not
                                 	 * visible from here -- confirm against pool_cache(9).
                                 	 */
   1096          1.162        ad 	fp = pool_cache_get(file_cache, PR_WAITOK);
   1097          1.191        ad 	if (fp == NULL) {
                                 		/* Give the descriptor reserved above back before failing. */
   1098          1.217       chs 		fd_abort(p, NULL, *resultfd);
   1099          1.191        ad 		return ENFILE;
   1100          1.191        ad 	}
   1101          1.173        ad 	KASSERT(fp->f_count == 0);
   1102          1.188       mrg 	KASSERT(fp->f_msgcount == 0);
   1103          1.188       mrg 	KASSERT(fp->f_unpcount == 0);
   1104          1.167        ad 
   1105          1.191        ad 	/* Replace cached credentials if not what we need. */
   1106          1.191        ad 	cred = curlwp->l_cred;
   1107          1.191        ad 	if (__predict_false(cred != fp->f_cred)) {
   1108          1.191        ad 		kauth_cred_free(fp->f_cred);
   1109          1.191        ad 		kauth_cred_hold(cred);
   1110          1.191        ad 		fp->f_cred = cred;
   1111           1.16       cgd 	}
   1112          1.167        ad 
   1113          1.188       mrg 	/*
   1114          1.188       mrg 	 * Don't allow recycled files to be scanned.
   1115          1.191        ad 	 * See uipc_usrreq.c.
   1116          1.188       mrg 	 */
   1117          1.191        ad 	if (__predict_false((fp->f_flag & FSCAN) != 0)) {
   1118          1.188       mrg 		mutex_enter(&fp->f_lock);
   1119          1.188       mrg 		atomic_and_uint(&fp->f_flag, ~FSCAN);
   1120          1.188       mrg 		mutex_exit(&fp->f_lock);
   1121          1.188       mrg 	}
   1122          1.188       mrg 
   1123          1.167        ad 	fp->f_advice = 0;
   1124          1.167        ad 	fp->f_offset = 0;
   1125          1.173        ad 	*resultfp = fp;
   1126          1.173        ad 
   1127          1.173        ad 	return 0;
   1128          1.173        ad }
   1129          1.173        ad 
   1130          1.173        ad /*
   1131          1.173        ad  * Successful creation of a new descriptor: make visible to the process.
   1132          1.173        ad  */
   1133          1.173        ad void
   1134          1.173        ad fd_affix(proc_t *p, file_t *fp, unsigned fd)
   1135          1.173        ad {
   1136          1.173        ad 	fdfile_t *ff;
   1137          1.173        ad 	filedesc_t *fdp;
   1138          1.173        ad 
   1139          1.173        ad 	KASSERT(p == curproc || p == &proc0);
   1140          1.173        ad 
   1141          1.173        ad 	/* Add a reference to the file structure. */
   1142          1.173        ad 	mutex_enter(&fp->f_lock);
   1143          1.173        ad 	fp->f_count++;
   1144          1.173        ad 	mutex_exit(&fp->f_lock);
   1145          1.167        ad 
   1146           1.16       cgd 	/*
   1147          1.173        ad 	 * Insert the new file into the descriptor slot.
   1148          1.173        ad 	 *
   1149          1.173        ad 	 * The memory barriers provided by lock activity in this routine
   1150          1.173        ad 	 * ensure that any updates to the file structure become globally
   1151          1.173        ad 	 * visible before the file becomes visible to other LWPs in the
   1152          1.173        ad 	 * current process.
   1153           1.16       cgd 	 */
   1154          1.173        ad 	fdp = p->p_fd;
   1155          1.192        ad 	ff = fdp->fd_dt->dt_ff[fd];
   1156          1.173        ad 
   1157          1.173        ad 	KASSERT(ff != NULL);
   1158          1.173        ad 	KASSERT(ff->ff_file == NULL);
   1159          1.173        ad 	KASSERT(ff->ff_allocated);
   1160          1.173        ad 	KASSERT(fd_isused(fdp, fd));
   1161          1.192        ad 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
   1162          1.173        ad 
   1163          1.173        ad 	/* No need to lock in order to make file initially visible. */
   1164          1.173        ad 	ff->ff_file = fp;
   1165          1.173        ad }
   1166          1.173        ad 
   1167          1.173        ad /*
   1168          1.173        ad  * Abort creation of a new descriptor: free descriptor slot and file.
   1169          1.173        ad  */
   1170          1.173        ad void
   1171          1.173        ad fd_abort(proc_t *p, file_t *fp, unsigned fd)
   1172          1.173        ad {
   1173          1.173        ad 	filedesc_t *fdp;
   1174          1.173        ad 	fdfile_t *ff;
   1175          1.173        ad 
   1176          1.173        ad 	KASSERT(p == curproc || p == &proc0);
   1177          1.173        ad 
   1178          1.173        ad 	fdp = p->p_fd;
   1179          1.192        ad 	ff = fdp->fd_dt->dt_ff[fd];
   1180          1.220     pooka 	ff->ff_exclose = false;
   1181          1.173        ad 
   1182          1.192        ad 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
   1183          1.173        ad 
   1184          1.173        ad 	mutex_enter(&fdp->fd_lock);
   1185          1.173        ad 	KASSERT(fd_isused(fdp, fd));
   1186          1.173        ad 	fd_unused(fdp, fd);
   1187          1.173        ad 	mutex_exit(&fdp->fd_lock);
   1188          1.167        ad 
   1189          1.173        ad 	if (fp != NULL) {
   1190          1.191        ad 		KASSERT(fp->f_count == 0);
   1191          1.191        ad 		KASSERT(fp->f_cred != NULL);
   1192          1.191        ad 		pool_cache_put(file_cache, fp);
   1193           1.59   thorpej 	}
   1194           1.16       cgd }
   1195           1.16       cgd 
   1196          1.167        ad static int
   1197          1.167        ad file_ctor(void *arg, void *obj, int flags)
   1198          1.167        ad {
   1199          1.173        ad 	file_t *fp = obj;
   1200          1.167        ad 
   1201          1.167        ad 	memset(fp, 0, sizeof(*fp));
   1202          1.167        ad 
   1203          1.167        ad 	mutex_enter(&filelist_lock);
   1204          1.191        ad 	if (__predict_false(nfiles >= maxfiles)) {
   1205          1.191        ad 		mutex_exit(&filelist_lock);
   1206          1.191        ad 		tablefull("file", "increase kern.maxfiles or MAXFILES");
   1207          1.191        ad 		return ENFILE;
   1208          1.191        ad 	}
   1209          1.191        ad 	nfiles++;
   1210          1.167        ad 	LIST_INSERT_HEAD(&filehead, fp, f_list);
   1211          1.191        ad 	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
   1212          1.191        ad 	fp->f_cred = curlwp->l_cred;
   1213          1.191        ad 	kauth_cred_hold(fp->f_cred);
   1214          1.167        ad 	mutex_exit(&filelist_lock);
   1215          1.167        ad 
   1216          1.167        ad 	return 0;
   1217          1.167        ad }
   1218          1.167        ad 
   1219          1.167        ad static void
   1220          1.167        ad file_dtor(void *arg, void *obj)
   1221          1.167        ad {
   1222          1.173        ad 	file_t *fp = obj;
   1223          1.167        ad 
   1224          1.167        ad 	mutex_enter(&filelist_lock);
   1225          1.191        ad 	nfiles--;
   1226          1.167        ad 	LIST_REMOVE(fp, f_list);
   1227          1.167        ad 	mutex_exit(&filelist_lock);
   1228          1.167        ad 
   1229          1.191        ad 	kauth_cred_free(fp->f_cred);
   1230          1.167        ad 	mutex_destroy(&fp->f_lock);
   1231          1.167        ad }
   1232          1.167        ad 
   1233          1.173        ad static int
   1234          1.173        ad fdfile_ctor(void *arg, void *obj, int flags)
   1235          1.173        ad {
   1236          1.173        ad 	fdfile_t *ff = obj;
   1237          1.173        ad 
   1238          1.173        ad 	memset(ff, 0, sizeof(*ff));
   1239          1.173        ad 	cv_init(&ff->ff_closing, "fdclose");
   1240          1.173        ad 
   1241          1.173        ad 	return 0;
   1242          1.173        ad }
   1243          1.173        ad 
   1244          1.173        ad static void
   1245          1.173        ad fdfile_dtor(void *arg, void *obj)
   1246          1.173        ad {
   1247          1.173        ad 	fdfile_t *ff = obj;
   1248          1.173        ad 
   1249          1.173        ad 	cv_destroy(&ff->ff_closing);
   1250          1.173        ad }
   1251          1.173        ad 
   1252          1.173        ad file_t *
   1253          1.169        ad fgetdummy(void)
   1254          1.169        ad {
   1255          1.173        ad 	file_t *fp;
   1256          1.169        ad 
   1257          1.213     rmind 	fp = kmem_zalloc(sizeof(*fp), KM_SLEEP);
   1258          1.169        ad 	if (fp != NULL) {
   1259          1.169        ad 		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
   1260          1.169        ad 	}
   1261          1.169        ad 	return fp;
   1262          1.169        ad }
   1263          1.169        ad 
   1264          1.169        ad void
   1265          1.173        ad fputdummy(file_t *fp)
   1266           1.58   thorpej {
   1267           1.58   thorpej 
   1268          1.173        ad 	mutex_destroy(&fp->f_lock);
   1269          1.173        ad 	kmem_free(fp, sizeof(*fp));
   1270           1.58   thorpej }
   1271           1.58   thorpej 
   1272           1.58   thorpej /*
   1273          1.173        ad  * Create an initial filedesc structure.
   1274           1.48   thorpej  */
   1275          1.173        ad filedesc_t *
   1276          1.173        ad fd_init(filedesc_t *fdp)
   1277           1.48   thorpej {
   1278          1.192        ad #ifdef DIAGNOSTIC
   1279          1.173        ad 	unsigned fd;
   1280          1.192        ad #endif
   1281          1.173        ad 
   1282          1.192        ad 	if (__predict_true(fdp == NULL)) {
   1283          1.173        ad 		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
   1284          1.173        ad 	} else {
   1285          1.206     pooka 		KASSERT(fdp == &filedesc0);
   1286          1.173        ad 		filedesc_ctor(NULL, fdp, PR_WAITOK);
   1287          1.173        ad 	}
   1288           1.48   thorpej 
   1289          1.192        ad #ifdef DIAGNOSTIC
   1290          1.173        ad 	KASSERT(fdp->fd_lastfile == -1);
   1291          1.173        ad 	KASSERT(fdp->fd_lastkqfile == -1);
   1292          1.173        ad 	KASSERT(fdp->fd_knhash == NULL);
   1293          1.192        ad 	KASSERT(fdp->fd_freefile == 0);
   1294          1.192        ad 	KASSERT(fdp->fd_exclose == false);
   1295          1.192        ad 	KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
   1296          1.192        ad 	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
   1297          1.173        ad 	for (fd = 0; fd < NDFDFILE; fd++) {
   1298          1.192        ad 		KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
   1299          1.192        ad 		    (fdfile_t *)fdp->fd_dfdfile[fd]);
   1300          1.192        ad 	}
   1301          1.192        ad 	for (fd = NDFDFILE; fd < NDFILE; fd++) {
   1302          1.192        ad 		KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
   1303          1.173        ad 	}
   1304          1.195      yamt 	KASSERT(fdp->fd_himap == fdp->fd_dhimap);
   1305          1.195      yamt 	KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
   1306          1.192        ad #endif	/* DIAGNOSTIC */
   1307          1.192        ad 
   1308          1.192        ad 	fdp->fd_refcnt = 1;
   1309          1.199      yamt 	fd_checkmaps(fdp);
   1310           1.48   thorpej 
   1311          1.173        ad 	return fdp;
   1312           1.48   thorpej }
   1313           1.48   thorpej 
   1314           1.48   thorpej /*
   1315           1.48   thorpej  * Initialize a file descriptor table.
   1316           1.48   thorpej  */
   1317          1.173        ad static int
   1318          1.173        ad filedesc_ctor(void *arg, void *obj, int flag)
   1319           1.48   thorpej {
   1320          1.173        ad 	filedesc_t *fdp = obj;
   1321          1.192        ad 	fdfile_t **ffp;
   1322          1.173        ad 	int i;
   1323           1.48   thorpej 
   1324          1.173        ad 	memset(fdp, 0, sizeof(*fdp));
   1325          1.173        ad 	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
   1326          1.173        ad 	fdp->fd_lastfile = -1;
   1327          1.173        ad 	fdp->fd_lastkqfile = -1;
   1328          1.192        ad 	fdp->fd_dt = &fdp->fd_dtbuiltin;
   1329          1.192        ad 	fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
   1330          1.195      yamt 	fdp->fd_himap = fdp->fd_dhimap;
   1331          1.195      yamt 	fdp->fd_lomap = fdp->fd_dlomap;
   1332          1.173        ad 
   1333          1.181      matt 	CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
   1334          1.192        ad 	for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
   1335          1.192        ad 		*ffp = (fdfile_t *)fdp->fd_dfdfile[i];
   1336          1.192        ad 		(void)fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
   1337          1.173        ad 	}
   1338           1.48   thorpej 
   1339          1.173        ad 	return 0;
   1340           1.48   thorpej }
   1341           1.48   thorpej 
   1342          1.173        ad static void
   1343          1.173        ad filedesc_dtor(void *arg, void *obj)
   1344           1.48   thorpej {
   1345          1.173        ad 	filedesc_t *fdp = obj;
   1346          1.173        ad 	int i;
   1347           1.48   thorpej 
   1348          1.173        ad 	for (i = 0; i < NDFDFILE; i++) {
   1349          1.173        ad 		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
   1350          1.173        ad 	}
   1351           1.48   thorpej 
   1352          1.173        ad 	mutex_destroy(&fdp->fd_lock);
   1353           1.48   thorpej }
   1354           1.48   thorpej 
   1355           1.48   thorpej /*
   1356          1.209     pooka  * Make p share curproc's filedesc structure.
   1357           1.48   thorpej  */
   1358           1.48   thorpej void
   1359          1.209     pooka fd_share(struct proc *p)
   1360           1.48   thorpej {
   1361          1.173        ad 	filedesc_t *fdp;
   1362           1.48   thorpej 
   1363          1.173        ad 	fdp = curlwp->l_fd;
   1364          1.209     pooka 	p->p_fd = fdp;
   1365          1.173        ad 	atomic_inc_uint(&fdp->fd_refcnt);
   1366           1.16       cgd }
   1367           1.16       cgd 
   1368           1.16       cgd /*
   1369          1.191        ad  * Acquire a hold on a filedesc structure.
   1370          1.191        ad  */
   1371          1.191        ad void
   1372          1.200     rmind fd_hold(lwp_t *l)
   1373          1.191        ad {
   1374          1.200     rmind 	filedesc_t *fdp = l->l_fd;
   1375          1.191        ad 
   1376          1.200     rmind 	atomic_inc_uint(&fdp->fd_refcnt);
   1377          1.191        ad }
   1378          1.191        ad 
   1379          1.191        ad /*
   1380           1.16       cgd  * Copy a filedesc structure.
   1381           1.16       cgd  */
   1382          1.173        ad filedesc_t *
   1383          1.173        ad fd_copy(void)
   1384           1.16       cgd {
   1385          1.173        ad 	filedesc_t *newfdp, *fdp;
   1386          1.192        ad 	fdfile_t *ff, **ffp, **nffp, *ff2;
   1387          1.192        ad 	int i, j, numfiles, lastfile, newlast;
   1388          1.173        ad 	file_t *fp;
   1389          1.192        ad 	fdtab_t *newdt;
   1390           1.16       cgd 
   1391          1.173        ad 	fdp = curproc->p_fd;
   1392          1.173        ad 	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
   1393           1.16       cgd 	newfdp->fd_refcnt = 1;
   1394          1.126        pk 
   1395          1.192        ad #ifdef DIAGNOSTIC
   1396          1.192        ad 	KASSERT(newfdp->fd_lastfile == -1);
   1397          1.192        ad 	KASSERT(newfdp->fd_lastkqfile == -1);
   1398          1.173        ad 	KASSERT(newfdp->fd_knhash == NULL);
   1399          1.192        ad 	KASSERT(newfdp->fd_freefile == 0);
   1400          1.192        ad 	KASSERT(newfdp->fd_exclose == false);
   1401          1.192        ad 	KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
   1402          1.192        ad 	KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
   1403          1.192        ad 	for (i = 0; i < NDFDFILE; i++) {
   1404          1.192        ad 		KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
   1405          1.192        ad 		    (fdfile_t *)&newfdp->fd_dfdfile[i]);
   1406          1.192        ad 	}
   1407          1.192        ad 	for (i = NDFDFILE; i < NDFILE; i++) {
   1408          1.192        ad 		KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
   1409          1.192        ad 	}
   1410          1.192        ad #endif	/* DIAGNOSTIC */
   1411          1.173        ad 
   1412          1.192        ad 	mutex_enter(&fdp->fd_lock);
   1413          1.192        ad 	fd_checkmaps(fdp);
   1414          1.192        ad 	numfiles = fdp->fd_dt->dt_nfiles;
   1415          1.192        ad 	lastfile = fdp->fd_lastfile;
   1416          1.173        ad 
   1417          1.192        ad 	/*
   1418          1.192        ad 	 * If the number of open files fits in the internal arrays
   1419          1.192        ad 	 * of the open file structure, use them, otherwise allocate
   1420          1.192        ad 	 * additional memory for the number of descriptors currently
   1421          1.192        ad 	 * in use.
   1422          1.192        ad 	 */
   1423          1.192        ad 	if (lastfile < NDFILE) {
   1424          1.192        ad 		i = NDFILE;
   1425          1.192        ad 		newdt = newfdp->fd_dt;
   1426          1.192        ad 		KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
   1427          1.192        ad 	} else {
   1428          1.173        ad 		/*
   1429          1.192        ad 		 * Compute the smallest multiple of NDEXTENT needed
   1430          1.192        ad 		 * for the file descriptors currently in use,
   1431          1.192        ad 		 * allowing the table to shrink.
   1432          1.173        ad 		 */
   1433          1.192        ad 		i = numfiles;
   1434          1.192        ad 		while (i >= 2 * NDEXTENT && i > lastfile * 2) {
   1435          1.192        ad 			i /= 2;
   1436          1.192        ad 		}
   1437          1.192        ad 		KASSERT(i > NDFILE);
   1438          1.192        ad 		newdt = fd_dtab_alloc(i);
   1439          1.192        ad 		newfdp->fd_dt = newdt;
   1440          1.192        ad 		memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
   1441          1.192        ad 		    NDFDFILE * sizeof(fdfile_t **));
   1442          1.192        ad 		memset(newdt->dt_ff + NDFDFILE, 0,
   1443          1.192        ad 		    (i - NDFDFILE) * sizeof(fdfile_t **));
   1444          1.192        ad 	}
   1445          1.192        ad 	if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
   1446          1.192        ad 		newfdp->fd_himap = newfdp->fd_dhimap;
   1447          1.192        ad 		newfdp->fd_lomap = newfdp->fd_dlomap;
   1448          1.192        ad 	} else {
   1449          1.192        ad 		fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
   1450          1.192        ad 		KASSERT(i >= NDENTRIES * NDENTRIES);
   1451          1.192        ad 		memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
   1452          1.192        ad 		memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
   1453          1.115    provos 	}
   1454          1.126        pk 	newfdp->fd_freefile = fdp->fd_freefile;
   1455          1.173        ad 	newfdp->fd_exclose = fdp->fd_exclose;
   1456          1.126        pk 
   1457          1.192        ad 	ffp = fdp->fd_dt->dt_ff;
   1458          1.192        ad 	nffp = newdt->dt_ff;
   1459          1.173        ad 	newlast = -1;
   1460          1.192        ad 	for (i = 0; i <= (int)lastfile; i++, ffp++, nffp++) {
   1461          1.192        ad 		KASSERT(i >= NDFDFILE ||
   1462          1.192        ad 		    *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
   1463          1.173        ad 		ff = *ffp;
   1464          1.192        ad 		if (ff == NULL || (fp = ff->ff_file) == NULL) {
   1465          1.192        ad 			/* Descriptor unused, or descriptor half open. */
   1466          1.192        ad 			KASSERT(!fd_isused(newfdp, i));
   1467          1.173        ad 			continue;
   1468          1.173        ad 		}
   1469          1.192        ad 		if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
   1470          1.173        ad 			/* kqueue descriptors cannot be copied. */
   1471          1.213     rmind 			if (i < newfdp->fd_freefile) {
   1472          1.213     rmind 				newfdp->fd_freefile = i;
   1473          1.213     rmind 			}
   1474          1.126        pk 			continue;
   1475          1.173        ad 		}
   1476          1.173        ad 		/* It's active: add a reference to the file. */
   1477          1.173        ad 		mutex_enter(&fp->f_lock);
   1478          1.173        ad 		fp->f_count++;
   1479          1.173        ad 		mutex_exit(&fp->f_lock);
   1480          1.192        ad 
   1481          1.192        ad 		/* Allocate an fdfile_t to represent it. */
   1482          1.173        ad 		if (i >= NDFDFILE) {
   1483          1.192        ad 			ff2 = pool_cache_get(fdfile_cache, PR_WAITOK);
   1484          1.192        ad 			*nffp = ff2;
   1485          1.192        ad 		} else {
   1486          1.192        ad 			ff2 = newdt->dt_ff[i];
   1487          1.173        ad 		}
   1488          1.173        ad 		ff2->ff_file = fp;
   1489          1.173        ad 		ff2->ff_exclose = ff->ff_exclose;
   1490          1.182      matt 		ff2->ff_allocated = true;
   1491          1.192        ad 
   1492          1.192        ad 		/* Fix up bitmaps. */
   1493          1.192        ad 		j = i >> NDENTRYSHIFT;
   1494          1.192        ad 		KASSERT((newfdp->fd_lomap[j] & (1 << (i & NDENTRYMASK))) == 0);
   1495          1.192        ad 		newfdp->fd_lomap[j] |= 1 << (i & NDENTRYMASK);
   1496          1.192        ad 		if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
   1497          1.192        ad 			KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
   1498          1.192        ad 			    (1 << (j & NDENTRYMASK))) == 0);
   1499          1.192        ad 			newfdp->fd_himap[j >> NDENTRYSHIFT] |=
   1500          1.192        ad 			    1 << (j & NDENTRYMASK);
   1501          1.173        ad 		}
   1502          1.192        ad 		newlast = i;
   1503          1.173        ad 	}
   1504          1.192        ad 	KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
   1505          1.192        ad 	newfdp->fd_lastfile = newlast;
   1506          1.192        ad 	fd_checkmaps(newfdp);
   1507          1.173        ad 	mutex_exit(&fdp->fd_lock);
   1508          1.213     rmind 
   1509          1.213     rmind 	return newfdp;
   1510           1.16       cgd }
   1511           1.16       cgd 
   1512           1.16       cgd /*
   1513           1.16       cgd  * Release a filedesc structure.
   1514           1.16       cgd  */
   1515           1.16       cgd void
   1516          1.173        ad fd_free(void)
   1517           1.16       cgd {
   1518          1.173        ad 	fdfile_t *ff;
   1519          1.173        ad 	file_t *fp;
   1520          1.192        ad 	int fd, nf;
   1521          1.192        ad 	fdtab_t *dt;
   1522          1.197      yamt 	lwp_t * const l = curlwp;
   1523          1.197      yamt 	filedesc_t * const fdp = l->l_fd;
   1524          1.197      yamt 	const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
   1525          1.173        ad 
   1526          1.192        ad 	KASSERT(fdp->fd_dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
   1527          1.192        ad 	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
   1528          1.192        ad 	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
   1529           1.16       cgd 
   1530          1.192        ad #ifndef __HAVE_ATOMIC_AS_MEMBAR
   1531          1.192        ad 	membar_exit();
   1532          1.192        ad #endif
   1533          1.164        ad 	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
   1534           1.16       cgd 		return;
   1535          1.126        pk 
   1536           1.16       cgd 	/*
   1537          1.173        ad 	 * Close any files that the process holds open.
   1538           1.16       cgd 	 */
   1539          1.192        ad 	dt = fdp->fd_dt;
   1540          1.192        ad 	fd_checkmaps(fdp);
   1541          1.196      yamt #ifdef DEBUG
   1542          1.196      yamt 	fdp->fd_refcnt = -1; /* see fd_checkmaps */
   1543          1.196      yamt #endif
   1544          1.192        ad 	for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
   1545          1.192        ad 		ff = dt->dt_ff[fd];
   1546          1.173        ad 		KASSERT(fd >= NDFDFILE ||
   1547          1.173        ad 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
   1548          1.192        ad 		if (ff == NULL)
   1549          1.173        ad 			continue;
   1550          1.173        ad 		if ((fp = ff->ff_file) != NULL) {
   1551          1.173        ad 			/*
   1552          1.192        ad 			 * Must use fd_close() here if there is
   1553          1.197      yamt 			 * a reference from kqueue or we might have posix
   1554          1.197      yamt 			 * advisory locks.
   1555          1.173        ad 			 */
   1556          1.197      yamt 			if (__predict_true(ff->ff_refcnt == 0) &&
   1557          1.197      yamt 			    (noadvlock || fp->f_type != DTYPE_VNODE)) {
   1558          1.192        ad 				ff->ff_file = NULL;
   1559          1.192        ad 				ff->ff_exclose = false;
   1560          1.192        ad 				ff->ff_allocated = false;
   1561          1.192        ad 				closef(fp);
   1562          1.192        ad 			} else {
   1563          1.192        ad 				ff->ff_refcnt++;
   1564          1.192        ad 				fd_close(fd);
   1565          1.192        ad 			}
   1566          1.173        ad 		}
   1567          1.173        ad 		KASSERT(ff->ff_refcnt == 0);
   1568          1.173        ad 		KASSERT(ff->ff_file == NULL);
   1569          1.173        ad 		KASSERT(!ff->ff_exclose);
   1570          1.173        ad 		KASSERT(!ff->ff_allocated);
   1571          1.173        ad 		if (fd >= NDFDFILE) {
   1572          1.173        ad 			pool_cache_put(fdfile_cache, ff);
   1573          1.192        ad 			dt->dt_ff[fd] = NULL;
   1574          1.173        ad 		}
   1575           1.16       cgd 	}
   1576           1.59   thorpej 
   1577           1.59   thorpej 	/*
   1578          1.173        ad 	 * Clean out the descriptor table for the next user and return
   1579          1.173        ad 	 * to the cache.
   1580           1.59   thorpej 	 */
   1581          1.192        ad 	if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
   1582          1.192        ad 		fd_dtab_free(fdp->fd_dt);
   1583          1.192        ad 		/* Otherwise, done above. */
   1584          1.192        ad 		memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
   1585          1.192        ad 		    (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
   1586          1.192        ad 		fdp->fd_dt = &fdp->fd_dtbuiltin;
   1587           1.59   thorpej 	}
   1588          1.192        ad 	if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
   1589          1.173        ad 		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
   1590          1.173        ad 		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
   1591          1.192        ad 		fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
   1592           1.16       cgd 	}
   1593          1.192        ad 	if (__predict_false(fdp->fd_knhash != NULL)) {
   1594          1.179        ad 		hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
   1595          1.173        ad 		fdp->fd_knhash = NULL;
   1596          1.173        ad 		fdp->fd_knhashmask = 0;
   1597          1.173        ad 	} else {
   1598          1.173        ad 		KASSERT(fdp->fd_knhashmask == 0);
   1599          1.137      yamt 	}
   1600          1.192        ad 	fdp->fd_dt = &fdp->fd_dtbuiltin;
   1601          1.173        ad 	fdp->fd_lastkqfile = -1;
   1602          1.192        ad 	fdp->fd_lastfile = -1;
   1603          1.192        ad 	fdp->fd_freefile = 0;
   1604          1.192        ad 	fdp->fd_exclose = false;
   1605          1.192        ad 	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
   1606          1.192        ad 	    offsetof(filedesc_t, fd_startzero));
   1607          1.195      yamt 	fdp->fd_himap = fdp->fd_dhimap;
   1608          1.195      yamt 	fdp->fd_lomap = fdp->fd_dlomap;
   1609          1.192        ad 	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
   1610          1.192        ad 	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
   1611          1.192        ad 	KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
   1612          1.196      yamt #ifdef DEBUG
   1613          1.196      yamt 	fdp->fd_refcnt = 0; /* see fd_checkmaps */
   1614          1.196      yamt #endif
   1615          1.192        ad 	fd_checkmaps(fdp);
   1616          1.173        ad 	pool_cache_put(filedesc_cache, fdp);
   1617          1.170    martin }
   1618          1.170    martin 
   1619           1.16       cgd /*
   1620           1.16       cgd  * File Descriptor pseudo-device driver (/dev/fd/).
   1621           1.16       cgd  *
   1622           1.16       cgd  * Opening minor device N dup()s the file (if any) connected to file
   1623           1.16       cgd  * descriptor N belonging to the calling process.  Note that this driver
   1624           1.16       cgd  * consists of only the ``open()'' routine, because all subsequent
   1625           1.16       cgd  * references to this file will be direct to the other driver.
   1626           1.16       cgd  */
   1627          1.134   thorpej static int
   1628          1.173        ad filedescopen(dev_t dev, int mode, int type, lwp_t *l)
   1629           1.16       cgd {
   1630           1.16       cgd 
   1631           1.28   mycroft 	/*
   1632          1.112  jdolecek 	 * XXX Kludge: set dupfd to contain the value of the
   1633           1.89     enami 	 * the file descriptor being sought for duplication. The error
   1634           1.28   mycroft 	 * return ensures that the vnode for this device will be released
   1635           1.28   mycroft 	 * by vn_open. Open will detect this special error and take the
   1636          1.208      yamt 	 * actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN
   1637           1.28   mycroft 	 * will simply report the error.
   1638           1.28   mycroft 	 */
   1639          1.138  christos 	l->l_dupfd = minor(dev);	/* XXX */
   1640          1.127  christos 	return EDUPFD;
   1641           1.27   mycroft }
   1642           1.27   mycroft 
   1643           1.28   mycroft /*
   1644           1.28   mycroft  * Duplicate the specified descriptor to a free descriptor.
   1645           1.28   mycroft  */
   1646           1.27   mycroft int
   1647          1.173        ad fd_dupopen(int old, int *new, int mode, int error)
   1648           1.72     lukem {
   1649          1.173        ad 	filedesc_t *fdp;
   1650          1.173        ad 	fdfile_t *ff;
   1651          1.173        ad 	file_t *fp;
   1652          1.192        ad 	fdtab_t *dt;
   1653           1.27   mycroft 
   1654          1.173        ad 	if ((fp = fd_getfile(old)) == NULL) {
   1655          1.173        ad 		return EBADF;
   1656          1.173        ad 	}
   1657          1.173        ad 	fdp = curlwp->l_fd;
   1658          1.192        ad 	dt = fdp->fd_dt;
   1659          1.192        ad 	ff = dt->dt_ff[old];
   1660           1.59   thorpej 
   1661           1.27   mycroft 	/*
   1662           1.28   mycroft 	 * There are two cases of interest here.
   1663           1.28   mycroft 	 *
   1664          1.208      yamt 	 * For EDUPFD simply dup (old) to file descriptor
   1665          1.208      yamt 	 * (new) and return.
   1666           1.28   mycroft 	 *
   1667          1.208      yamt 	 * For EMOVEFD steal away the file structure from (old) and
   1668          1.208      yamt 	 * store it in (new).  (old) is effectively closed by
   1669           1.28   mycroft 	 * this operation.
   1670           1.28   mycroft 	 *
   1671           1.28   mycroft 	 * Any other error code is just returned.
   1672           1.27   mycroft 	 */
   1673           1.28   mycroft 	switch (error) {
   1674          1.127  christos 	case EDUPFD:
   1675           1.28   mycroft 		/*
   1676           1.28   mycroft 		 * Check that the mode the file is being opened for is a
   1677           1.28   mycroft 		 * subset of the mode of the existing descriptor.
   1678           1.28   mycroft 		 */
   1679          1.173        ad 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
   1680          1.173        ad 			error = EACCES;
   1681          1.173        ad 			break;
   1682          1.173        ad 		}
   1683          1.173        ad 
   1684          1.173        ad 		/* Copy it. */
   1685          1.192        ad 		error = fd_dup(fp, 0, new, ff->ff_exclose);
   1686          1.173        ad 		break;
   1687           1.27   mycroft 
   1688          1.127  christos 	case EMOVEFD:
   1689          1.173        ad 		/* Copy it. */
   1690          1.192        ad 		error = fd_dup(fp, 0, new, ff->ff_exclose);
   1691          1.173        ad 		if (error != 0) {
   1692          1.173        ad 			break;
   1693          1.173        ad 		}
   1694           1.16       cgd 
   1695          1.173        ad 		/* Steal away the file pointer from 'old'. */
   1696          1.173        ad 		(void)fd_close(old);
   1697          1.173        ad 		return 0;
   1698           1.28   mycroft 	}
   1699          1.173        ad 
   1700          1.173        ad 	fd_putfile(old);
   1701          1.173        ad 	return error;
   1702           1.61  wrstuden }
   1703           1.61  wrstuden 
   1704           1.61  wrstuden /*
   1705          1.211     pooka  * Close open files on exec.
   1706          1.211     pooka  */
   1707          1.211     pooka void
   1708          1.211     pooka fd_closeexec(void)
   1709          1.211     pooka {
   1710          1.211     pooka 	proc_t *p;
   1711          1.211     pooka 	filedesc_t *fdp;
   1712          1.211     pooka 	fdfile_t *ff;
   1713          1.211     pooka 	lwp_t *l;
   1714          1.211     pooka 	fdtab_t *dt;
   1715          1.211     pooka 	int fd;
   1716          1.211     pooka 
   1717          1.211     pooka 	l = curlwp;
   1718          1.211     pooka 	p = l->l_proc;
   1719          1.211     pooka 	fdp = p->p_fd;
   1720          1.211     pooka 
   1721          1.211     pooka 	if (fdp->fd_refcnt > 1) {
   1722          1.211     pooka 		fdp = fd_copy();
   1723          1.211     pooka 		fd_free();
   1724          1.211     pooka 		p->p_fd = fdp;
   1725          1.211     pooka 		l->l_fd = fdp;
   1726          1.211     pooka 	}
   1727          1.211     pooka 	if (!fdp->fd_exclose) {
   1728          1.211     pooka 		return;
   1729          1.211     pooka 	}
   1730          1.211     pooka 	fdp->fd_exclose = false;
   1731          1.211     pooka 	dt = fdp->fd_dt;
   1732          1.211     pooka 
   1733          1.211     pooka 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
   1734          1.211     pooka 		if ((ff = dt->dt_ff[fd]) == NULL) {
   1735          1.211     pooka 			KASSERT(fd >= NDFDFILE);
   1736          1.211     pooka 			continue;
   1737          1.211     pooka 		}
   1738          1.211     pooka 		KASSERT(fd >= NDFDFILE ||
   1739          1.211     pooka 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
   1740          1.211     pooka 		if (ff->ff_file == NULL)
   1741          1.211     pooka 			continue;
   1742          1.211     pooka 		if (ff->ff_exclose) {
   1743          1.211     pooka 			/*
   1744          1.211     pooka 			 * We need a reference to close the file.
   1745          1.211     pooka 			 * No other threads can see the fdfile_t at
   1746          1.211     pooka 			 * this point, so don't bother locking.
   1747          1.211     pooka 			 */
   1748          1.211     pooka 			KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
   1749          1.211     pooka 			ff->ff_refcnt++;
   1750          1.211     pooka 			fd_close(fd);
   1751          1.211     pooka 		}
   1752          1.211     pooka 	}
   1753          1.211     pooka }
   1754          1.211     pooka 
   1755          1.211     pooka /*
   1756          1.113  jdolecek  * Sets descriptor owner. If the owner is a process, 'pgid'
   1757          1.113  jdolecek  * is set to positive value, process ID. If the owner is process group,
   1758          1.113  jdolecek  * 'pgid' is set to -pg_id.
   1759          1.113  jdolecek  */
   1760          1.113  jdolecek int
   1761          1.180  gmcgarry fsetown(pid_t *pgid, u_long cmd, const void *data)
   1762          1.113  jdolecek {
   1763          1.203     rmind 	pid_t id = *(const pid_t *)data;
   1764          1.113  jdolecek 	int error;
   1765          1.113  jdolecek 
   1766          1.113  jdolecek 	switch (cmd) {
   1767          1.113  jdolecek 	case TIOCSPGRP:
   1768          1.113  jdolecek 		if (id < 0)
   1769          1.203     rmind 			return EINVAL;
   1770          1.113  jdolecek 		id = -id;
   1771          1.113  jdolecek 		break;
   1772          1.113  jdolecek 	default:
   1773          1.113  jdolecek 		break;
   1774          1.113  jdolecek 	}
   1775          1.203     rmind 	if (id > 0) {
   1776          1.203     rmind 		mutex_enter(proc_lock);
   1777          1.203     rmind 		error = proc_find(id) ? 0 : ESRCH;
   1778          1.203     rmind 		mutex_exit(proc_lock);
   1779          1.203     rmind 	} else if (id < 0) {
   1780          1.203     rmind 		error = pgid_in_session(curproc, -id);
   1781          1.203     rmind 	} else {
   1782          1.203     rmind 		error = 0;
   1783          1.203     rmind 	}
   1784          1.203     rmind 	if (!error) {
   1785          1.203     rmind 		*pgid = id;
   1786          1.203     rmind 	}
   1787          1.203     rmind 	return error;
   1788          1.113  jdolecek }
   1789          1.113  jdolecek 
   1790          1.212  christos void
   1791          1.212  christos fd_set_exclose(struct lwp *l, int fd, bool exclose)
   1792          1.212  christos {
   1793          1.212  christos 	filedesc_t *fdp = l->l_fd;
   1794          1.212  christos 	fdfile_t *ff = fdp->fd_dt->dt_ff[fd];
   1795          1.213     rmind 
   1796          1.212  christos 	ff->ff_exclose = exclose;
   1797          1.212  christos 	if (exclose)
   1798          1.212  christos 		fdp->fd_exclose = true;
   1799          1.212  christos }
   1800          1.212  christos 
   1801          1.113  jdolecek /*
   1802          1.113  jdolecek  * Return descriptor owner information. If the value is positive,
   1803          1.113  jdolecek  * it's process ID. If it's negative, it's process group ID and
   1804          1.113  jdolecek  * needs the sign removed before use.
   1805          1.113  jdolecek  */
   1806          1.113  jdolecek int
   1807          1.180  gmcgarry fgetown(pid_t pgid, u_long cmd, void *data)
   1808          1.113  jdolecek {
   1809          1.173        ad 
   1810          1.113  jdolecek 	switch (cmd) {
   1811          1.113  jdolecek 	case TIOCGPGRP:
   1812          1.113  jdolecek 		*(int *)data = -pgid;
   1813          1.113  jdolecek 		break;
   1814          1.113  jdolecek 	default:
   1815          1.113  jdolecek 		*(int *)data = pgid;
   1816          1.113  jdolecek 		break;
   1817          1.113  jdolecek 	}
   1818          1.213     rmind 	return 0;
   1819          1.113  jdolecek }
   1820          1.113  jdolecek 
   1821          1.113  jdolecek /*
   1822          1.113  jdolecek  * Send signal to descriptor owner, either process or process group.
   1823          1.113  jdolecek  */
   1824          1.113  jdolecek void
   1825          1.114  christos fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
   1826          1.113  jdolecek {
   1827          1.131     perry 	ksiginfo_t ksi;
   1828          1.113  jdolecek 
   1829          1.176        ad 	KASSERT(!cpu_intr_p());
   1830          1.176        ad 
   1831          1.189     rmind 	if (pgid == 0) {
   1832          1.189     rmind 		return;
   1833          1.189     rmind 	}
   1834          1.189     rmind 
   1835          1.148      yamt 	KSI_INIT(&ksi);
   1836          1.114  christos 	ksi.ksi_signo = signo;
   1837          1.113  jdolecek 	ksi.ksi_code = code;
   1838          1.113  jdolecek 	ksi.ksi_band = band;
   1839          1.113  jdolecek 
   1840          1.176        ad 	mutex_enter(proc_lock);
   1841          1.189     rmind 	if (pgid > 0) {
   1842          1.189     rmind 		struct proc *p1;
   1843          1.189     rmind 
   1844          1.203     rmind 		p1 = proc_find(pgid);
   1845          1.189     rmind 		if (p1 != NULL) {
   1846          1.189     rmind 			kpsignal(p1, &ksi, fdescdata);
   1847          1.189     rmind 		}
   1848          1.189     rmind 	} else {
   1849          1.189     rmind 		struct pgrp *pgrp;
   1850          1.189     rmind 
   1851          1.189     rmind 		KASSERT(pgid < 0);
   1852          1.203     rmind 		pgrp = pgrp_find(-pgid);
   1853          1.189     rmind 		if (pgrp != NULL) {
   1854          1.189     rmind 			kpgsignal(pgrp, &ksi, fdescdata, 0);
   1855          1.189     rmind 		}
   1856          1.189     rmind 	}
   1857          1.176        ad 	mutex_exit(proc_lock);
   1858          1.113  jdolecek }
   1859          1.127  christos 
   1860          1.127  christos int
   1861          1.173        ad fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
   1862          1.173        ad 	 void *data)
   1863          1.127  christos {
   1864      1.225.2.1       snj 	fdfile_t *ff;
   1865      1.225.2.1       snj 	filedesc_t *fdp;
   1866          1.173        ad 
   1867  1.225.2.1.2.1       snj 	fp->f_flag = flag & FMASK;
   1868      1.225.2.1       snj 	fdp = curproc->p_fd;
   1869      1.225.2.1       snj 	ff = fdp->fd_dt->dt_ff[fd];
   1870      1.225.2.1       snj 	KASSERT(ff != NULL);
   1871      1.225.2.1       snj 	ff->ff_exclose = (flag & O_CLOEXEC) != 0;
   1872          1.127  christos 	fp->f_type = DTYPE_MISC;
   1873          1.127  christos 	fp->f_ops = fops;
   1874          1.127  christos 	fp->f_data = data;
   1875          1.173        ad 	curlwp->l_dupfd = fd;
   1876          1.173        ad 	fd_affix(curproc, fp, fd);
   1877          1.127  christos 
   1878          1.127  christos 	return EMOVEFD;
   1879          1.127  christos }
   1880          1.127  christos 
   1881          1.127  christos int
   1882          1.173        ad fnullop_fcntl(file_t *fp, u_int cmd, void *data)
   1883          1.127  christos {
   1884          1.147      yamt 
   1885          1.127  christos 	if (cmd == F_SETFL)
   1886          1.127  christos 		return 0;
   1887          1.127  christos 
   1888          1.127  christos 	return EOPNOTSUPP;
   1889          1.127  christos }
   1890          1.127  christos 
   1891          1.127  christos int
   1892          1.173        ad fnullop_poll(file_t *fp, int which)
   1893          1.127  christos {
   1894          1.147      yamt 
   1895          1.127  christos 	return 0;
   1896          1.127  christos }
   1897          1.127  christos 
   1898          1.127  christos int
   1899          1.173        ad fnullop_kqfilter(file_t *fp, struct knote *kn)
   1900          1.127  christos {
   1901          1.127  christos 
   1902          1.219  christos 	return EOPNOTSUPP;
   1903          1.127  christos }
   1904          1.127  christos 
   1905          1.190        ad void
   1906          1.202       dsl fnullop_restart(file_t *fp)
   1907          1.190        ad {
   1908          1.190        ad 
   1909          1.190        ad }
   1910          1.190        ad 
   1911          1.127  christos int
   1912          1.173        ad fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
   1913          1.173        ad 	    kauth_cred_t cred, int flags)
   1914          1.160     rmind {
   1915          1.160     rmind 
   1916          1.160     rmind 	return EOPNOTSUPP;
   1917          1.160     rmind }
   1918          1.160     rmind 
   1919          1.160     rmind int
   1920          1.173        ad fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
   1921          1.173        ad 	     kauth_cred_t cred, int flags)
   1922          1.160     rmind {
   1923          1.160     rmind 
   1924          1.160     rmind 	return EOPNOTSUPP;
   1925          1.160     rmind }
   1926          1.160     rmind 
   1927          1.160     rmind int
   1928          1.173        ad fbadop_ioctl(file_t *fp, u_long com, void *data)
   1929          1.160     rmind {
   1930          1.160     rmind 
   1931          1.160     rmind 	return EOPNOTSUPP;
   1932          1.160     rmind }
   1933          1.160     rmind 
   1934          1.160     rmind int
   1935          1.173        ad fbadop_stat(file_t *fp, struct stat *sb)
   1936          1.127  christos {
   1937          1.147      yamt 
   1938          1.127  christos 	return EOPNOTSUPP;
   1939          1.127  christos }
   1940          1.160     rmind 
   1941          1.160     rmind int
   1942          1.173        ad fbadop_close(file_t *fp)
   1943          1.160     rmind {
   1944          1.160     rmind 
   1945          1.160     rmind 	return EOPNOTSUPP;
   1946          1.160     rmind }
   1947          1.210     pooka 
   1948          1.210     pooka /*
   1949          1.210     pooka  * sysctl routines pertaining to file descriptors
   1950          1.210     pooka  */
   1951          1.210     pooka 
   1952          1.210     pooka /* Initialized in sysctl_init() for now... */
   1953          1.210     pooka extern kmutex_t sysctl_file_marker_lock;
   1954          1.210     pooka static u_int sysctl_file_marker = 1;
   1955          1.210     pooka 
   1956          1.210     pooka /*
   1957          1.210     pooka  * Expects to be called with proc_lock and sysctl_file_marker_lock locked.
   1958          1.210     pooka  */
   1959          1.210     pooka static void
   1960          1.210     pooka sysctl_file_marker_reset(void)
   1961          1.210     pooka {
   1962          1.210     pooka 	struct proc *p;
   1963          1.210     pooka 
   1964          1.210     pooka 	PROCLIST_FOREACH(p, &allproc) {
   1965          1.210     pooka 		struct filedesc *fd = p->p_fd;
   1966          1.210     pooka 		fdtab_t *dt;
   1967          1.210     pooka 		u_int i;
   1968          1.210     pooka 
   1969          1.210     pooka 		mutex_enter(&fd->fd_lock);
   1970          1.210     pooka 		dt = fd->fd_dt;
   1971          1.210     pooka 		for (i = 0; i < dt->dt_nfiles; i++) {
   1972          1.210     pooka 			struct file *fp;
   1973          1.210     pooka 			fdfile_t *ff;
   1974          1.210     pooka 
   1975          1.210     pooka 			if ((ff = dt->dt_ff[i]) == NULL) {
   1976          1.210     pooka 				continue;
   1977          1.210     pooka 			}
   1978          1.210     pooka 			if ((fp = ff->ff_file) == NULL) {
   1979          1.210     pooka 				continue;
   1980          1.210     pooka 			}
   1981          1.210     pooka 			fp->f_marker = 0;
   1982          1.210     pooka 		}
   1983          1.210     pooka 		mutex_exit(&fd->fd_lock);
   1984          1.210     pooka 	}
   1985          1.210     pooka }
   1986          1.210     pooka 
   1987          1.210     pooka /*
   1988          1.210     pooka  * sysctl helper routine for kern.file pseudo-subtree.
   1989          1.210     pooka  */
   1990          1.210     pooka static int
   1991          1.210     pooka sysctl_kern_file(SYSCTLFN_ARGS)
   1992          1.210     pooka {
   1993          1.210     pooka 	int error;
   1994          1.210     pooka 	size_t buflen;
   1995          1.210     pooka 	struct file *fp, fbuf;
   1996          1.210     pooka 	char *start, *where;
   1997          1.210     pooka 	struct proc *p;
   1998          1.210     pooka 
   1999          1.210     pooka 	start = where = oldp;
   2000          1.210     pooka 	buflen = *oldlenp;
   2001          1.210     pooka 
   2002          1.210     pooka 	if (where == NULL) {
   2003          1.210     pooka 		/*
   2004          1.210     pooka 		 * overestimate by 10 files
   2005          1.210     pooka 		 */
   2006          1.210     pooka 		*oldlenp = sizeof(filehead) + (nfiles + 10) *
   2007          1.210     pooka 		    sizeof(struct file);
   2008          1.213     rmind 		return 0;
   2009          1.210     pooka 	}
   2010          1.210     pooka 
   2011          1.210     pooka 	/*
   2012          1.210     pooka 	 * first sysctl_copyout filehead
   2013          1.210     pooka 	 */
   2014          1.210     pooka 	if (buflen < sizeof(filehead)) {
   2015          1.210     pooka 		*oldlenp = 0;
   2016          1.213     rmind 		return 0;
   2017          1.210     pooka 	}
   2018          1.210     pooka 	sysctl_unlock();
   2019          1.210     pooka 	error = sysctl_copyout(l, &filehead, where, sizeof(filehead));
   2020          1.210     pooka 	if (error) {
   2021          1.213     rmind 		sysctl_relock();
   2022          1.210     pooka 		return error;
   2023          1.210     pooka 	}
   2024          1.210     pooka 	buflen -= sizeof(filehead);
   2025          1.210     pooka 	where += sizeof(filehead);
   2026          1.210     pooka 
   2027          1.210     pooka 	/*
   2028          1.210     pooka 	 * followed by an array of file structures
   2029          1.210     pooka 	 */
   2030          1.210     pooka 	mutex_enter(&sysctl_file_marker_lock);
   2031          1.210     pooka 	mutex_enter(proc_lock);
   2032          1.210     pooka 	PROCLIST_FOREACH(p, &allproc) {
   2033          1.210     pooka 		struct filedesc *fd;
   2034          1.210     pooka 		fdtab_t *dt;
   2035          1.210     pooka 		u_int i;
   2036          1.210     pooka 
   2037          1.210     pooka 		if (p->p_stat == SIDL) {
   2038          1.210     pooka 			/* skip embryonic processes */
   2039          1.210     pooka 			continue;
   2040          1.210     pooka 		}
   2041          1.210     pooka 		mutex_enter(p->p_lock);
   2042          1.210     pooka 		error = kauth_authorize_process(l->l_cred,
   2043          1.210     pooka 		    KAUTH_PROCESS_CANSEE, p,
   2044          1.210     pooka 		    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
   2045          1.210     pooka 		    NULL, NULL);
   2046          1.210     pooka 		mutex_exit(p->p_lock);
   2047          1.210     pooka 		if (error != 0) {
   2048          1.210     pooka 			/*
   2049          1.210     pooka 			 * Don't leak kauth retval if we're silently
   2050          1.210     pooka 			 * skipping this entry.
   2051          1.210     pooka 			 */
   2052          1.210     pooka 			error = 0;
   2053          1.210     pooka 			continue;
   2054          1.210     pooka 		}
   2055          1.210     pooka 
   2056          1.210     pooka 		/*
   2057          1.210     pooka 		 * Grab a hold on the process.
   2058          1.210     pooka 		 */
   2059          1.210     pooka 		if (!rw_tryenter(&p->p_reflock, RW_READER)) {
   2060          1.210     pooka 			continue;
   2061          1.210     pooka 		}
   2062          1.210     pooka 		mutex_exit(proc_lock);
   2063          1.210     pooka 
   2064          1.210     pooka 		fd = p->p_fd;
   2065          1.210     pooka 		mutex_enter(&fd->fd_lock);
   2066          1.210     pooka 		dt = fd->fd_dt;
   2067          1.210     pooka 		for (i = 0; i < dt->dt_nfiles; i++) {
   2068          1.210     pooka 			fdfile_t *ff;
   2069          1.210     pooka 
   2070          1.210     pooka 			if ((ff = dt->dt_ff[i]) == NULL) {
   2071          1.210     pooka 				continue;
   2072          1.210     pooka 			}
   2073          1.210     pooka 			if ((fp = ff->ff_file) == NULL) {
   2074          1.210     pooka 				continue;
   2075          1.210     pooka 			}
   2076          1.210     pooka 
   2077          1.210     pooka 			mutex_enter(&fp->f_lock);
   2078          1.210     pooka 
   2079          1.210     pooka 			if ((fp->f_count == 0) ||
   2080          1.210     pooka 			    (fp->f_marker == sysctl_file_marker)) {
   2081          1.210     pooka 				mutex_exit(&fp->f_lock);
   2082          1.210     pooka 				continue;
   2083          1.210     pooka 			}
   2084          1.210     pooka 
   2085          1.210     pooka 			/* Check that we have enough space. */
   2086          1.210     pooka 			if (buflen < sizeof(struct file)) {
   2087          1.210     pooka 				*oldlenp = where - start;
   2088          1.213     rmind 				mutex_exit(&fp->f_lock);
   2089          1.210     pooka 				error = ENOMEM;
   2090          1.210     pooka 				break;
   2091          1.210     pooka 			}
   2092          1.210     pooka 
   2093          1.210     pooka 			memcpy(&fbuf, fp, sizeof(fbuf));
   2094          1.210     pooka 			mutex_exit(&fp->f_lock);
   2095          1.210     pooka 			error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
   2096          1.210     pooka 			if (error) {
   2097          1.210     pooka 				break;
   2098          1.210     pooka 			}
   2099          1.210     pooka 			buflen -= sizeof(struct file);
   2100          1.210     pooka 			where += sizeof(struct file);
   2101          1.210     pooka 
   2102          1.210     pooka 			fp->f_marker = sysctl_file_marker;
   2103          1.210     pooka 		}
   2104          1.210     pooka 		mutex_exit(&fd->fd_lock);
   2105          1.210     pooka 
   2106          1.210     pooka 		/*
   2107          1.210     pooka 		 * Release reference to process.
   2108          1.210     pooka 		 */
   2109          1.210     pooka 		mutex_enter(proc_lock);
   2110          1.210     pooka 		rw_exit(&p->p_reflock);
   2111          1.210     pooka 
   2112          1.210     pooka 		if (error)
   2113          1.210     pooka 			break;
   2114          1.210     pooka 	}
   2115          1.210     pooka 
   2116          1.210     pooka 	sysctl_file_marker++;
   2117          1.210     pooka 	/* Reset all markers if wrapped. */
   2118          1.210     pooka 	if (sysctl_file_marker == 0) {
   2119          1.210     pooka 		sysctl_file_marker_reset();
   2120          1.210     pooka 		sysctl_file_marker++;
   2121          1.210     pooka 	}
   2122          1.210     pooka 
   2123          1.210     pooka 	mutex_exit(proc_lock);
   2124          1.210     pooka 	mutex_exit(&sysctl_file_marker_lock);
   2125          1.210     pooka 
   2126          1.210     pooka 	*oldlenp = where - start;
   2127          1.213     rmind 	sysctl_relock();
   2128          1.213     rmind 	return error;
   2129          1.210     pooka }
   2130          1.210     pooka 
   2131          1.210     pooka /*
   2132          1.210     pooka  * sysctl helper function for kern.file2
   2133          1.210     pooka  */
   2134          1.210     pooka static int
   2135          1.210     pooka sysctl_kern_file2(SYSCTLFN_ARGS)
   2136          1.210     pooka {
   2137          1.210     pooka 	struct proc *p;
   2138          1.210     pooka 	struct file *fp;
   2139          1.210     pooka 	struct filedesc *fd;
   2140          1.210     pooka 	struct kinfo_file kf;
   2141          1.210     pooka 	char *dp;
   2142          1.210     pooka 	u_int i, op;
   2143          1.210     pooka 	size_t len, needed, elem_size, out_size;
   2144          1.210     pooka 	int error, arg, elem_count;
   2145          1.210     pooka 	fdfile_t *ff;
   2146          1.210     pooka 	fdtab_t *dt;
   2147          1.210     pooka 
   2148          1.210     pooka 	if (namelen == 1 && name[0] == CTL_QUERY)
   2149          1.213     rmind 		return sysctl_query(SYSCTLFN_CALL(rnode));
   2150          1.210     pooka 
   2151          1.210     pooka 	if (namelen != 4)
   2152          1.213     rmind 		return EINVAL;
   2153          1.210     pooka 
   2154          1.210     pooka 	error = 0;
   2155          1.210     pooka 	dp = oldp;
   2156          1.210     pooka 	len = (oldp != NULL) ? *oldlenp : 0;
   2157          1.210     pooka 	op = name[0];
   2158          1.210     pooka 	arg = name[1];
   2159          1.210     pooka 	elem_size = name[2];
   2160          1.210     pooka 	elem_count = name[3];
   2161          1.210     pooka 	out_size = MIN(sizeof(kf), elem_size);
   2162          1.210     pooka 	needed = 0;
   2163          1.210     pooka 
   2164          1.210     pooka 	if (elem_size < 1 || elem_count < 0)
   2165          1.213     rmind 		return EINVAL;
   2166          1.210     pooka 
   2167          1.210     pooka 	switch (op) {
   2168          1.210     pooka 	case KERN_FILE_BYFILE:
   2169          1.210     pooka 	case KERN_FILE_BYPID:
   2170          1.210     pooka 		/*
   2171          1.210     pooka 		 * We're traversing the process list in both cases; the BYFILE
   2172          1.210     pooka 		 * case does additional work of keeping track of files already
   2173          1.210     pooka 		 * looked at.
   2174          1.210     pooka 		 */
   2175          1.210     pooka 
   2176          1.210     pooka 		/* doesn't use arg so it must be zero */
   2177          1.210     pooka 		if ((op == KERN_FILE_BYFILE) && (arg != 0))
   2178          1.210     pooka 			return EINVAL;
   2179          1.210     pooka 
   2180          1.210     pooka 		if ((op == KERN_FILE_BYPID) && (arg < -1))
   2181          1.210     pooka 			/* -1 means all processes */
   2182          1.213     rmind 			return EINVAL;
   2183          1.210     pooka 
   2184          1.210     pooka 		sysctl_unlock();
   2185          1.210     pooka 		if (op == KERN_FILE_BYFILE)
   2186          1.210     pooka 			mutex_enter(&sysctl_file_marker_lock);
   2187          1.210     pooka 		mutex_enter(proc_lock);
   2188          1.210     pooka 		PROCLIST_FOREACH(p, &allproc) {
   2189          1.210     pooka 			if (p->p_stat == SIDL) {
   2190          1.210     pooka 				/* skip embryonic processes */
   2191          1.210     pooka 				continue;
   2192          1.210     pooka 			}
   2193          1.210     pooka 			if (arg > 0 && p->p_pid != arg) {
   2194          1.210     pooka 				/* pick only the one we want */
   2195          1.210     pooka 				/* XXX want 0 to mean "kernel files" */
   2196          1.210     pooka 				continue;
   2197          1.210     pooka 			}
   2198          1.210     pooka 			mutex_enter(p->p_lock);
   2199          1.210     pooka 			error = kauth_authorize_process(l->l_cred,
   2200          1.210     pooka 			    KAUTH_PROCESS_CANSEE, p,
   2201          1.210     pooka 			    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
   2202          1.210     pooka 			    NULL, NULL);
   2203          1.210     pooka 			mutex_exit(p->p_lock);
   2204          1.210     pooka 			if (error != 0) {
   2205          1.210     pooka 				/*
   2206          1.210     pooka 				 * Don't leak kauth retval if we're silently
   2207          1.210     pooka 				 * skipping this entry.
   2208          1.210     pooka 				 */
   2209          1.210     pooka 				error = 0;
   2210          1.210     pooka 				continue;
   2211          1.210     pooka 			}
   2212          1.210     pooka 
   2213          1.210     pooka 			/*
   2214          1.210     pooka 			 * Grab a hold on the process.
   2215          1.210     pooka 			 */
   2216          1.210     pooka 			if (!rw_tryenter(&p->p_reflock, RW_READER)) {
   2217          1.210     pooka 				continue;
   2218          1.210     pooka 			}
   2219          1.210     pooka 			mutex_exit(proc_lock);
   2220          1.210     pooka 
   2221          1.210     pooka 			fd = p->p_fd;
   2222          1.210     pooka 			mutex_enter(&fd->fd_lock);
   2223          1.210     pooka 			dt = fd->fd_dt;
   2224          1.210     pooka 			for (i = 0; i < dt->dt_nfiles; i++) {
   2225          1.210     pooka 				if ((ff = dt->dt_ff[i]) == NULL) {
   2226          1.210     pooka 					continue;
   2227          1.210     pooka 				}
   2228          1.210     pooka 				if ((fp = ff->ff_file) == NULL) {
   2229          1.210     pooka 					continue;
   2230          1.210     pooka 				}
   2231          1.210     pooka 
   2232          1.210     pooka 				if ((op == KERN_FILE_BYFILE) &&
   2233          1.210     pooka 				    (fp->f_marker == sysctl_file_marker)) {
   2234          1.210     pooka 					continue;
   2235          1.210     pooka 				}
   2236          1.210     pooka 				if (len >= elem_size && elem_count > 0) {
   2237          1.210     pooka 					mutex_enter(&fp->f_lock);
   2238          1.210     pooka 					fill_file(&kf, fp, ff, i, p->p_pid);
   2239          1.210     pooka 					mutex_exit(&fp->f_lock);
   2240          1.210     pooka 					mutex_exit(&fd->fd_lock);
   2241          1.210     pooka 					error = sysctl_copyout(l,
   2242          1.210     pooka 					    &kf, dp, out_size);
   2243          1.210     pooka 					mutex_enter(&fd->fd_lock);
   2244          1.210     pooka 					if (error)
   2245          1.210     pooka 						break;
   2246          1.210     pooka 					dp += elem_size;
   2247          1.210     pooka 					len -= elem_size;
   2248          1.210     pooka 				}
   2249          1.210     pooka 				if (op == KERN_FILE_BYFILE)
   2250          1.210     pooka 					fp->f_marker = sysctl_file_marker;
   2251          1.210     pooka 				needed += elem_size;
   2252          1.210     pooka 				if (elem_count > 0 && elem_count != INT_MAX)
   2253          1.210     pooka 					elem_count--;
   2254          1.210     pooka 			}
   2255          1.210     pooka 			mutex_exit(&fd->fd_lock);
   2256          1.210     pooka 
   2257          1.210     pooka 			/*
   2258          1.210     pooka 			 * Release reference to process.
   2259          1.210     pooka 			 */
   2260          1.210     pooka 			mutex_enter(proc_lock);
   2261          1.210     pooka 			rw_exit(&p->p_reflock);
   2262          1.210     pooka 		}
   2263          1.210     pooka 		if (op == KERN_FILE_BYFILE) {
   2264          1.210     pooka 			sysctl_file_marker++;
   2265          1.210     pooka 
   2266          1.210     pooka 			/* Reset all markers if wrapped. */
   2267          1.210     pooka 			if (sysctl_file_marker == 0) {
   2268          1.210     pooka 				sysctl_file_marker_reset();
   2269          1.210     pooka 				sysctl_file_marker++;
   2270          1.210     pooka 			}
   2271          1.210     pooka 		}
   2272          1.210     pooka 		mutex_exit(proc_lock);
   2273          1.210     pooka 		if (op == KERN_FILE_BYFILE)
   2274          1.210     pooka 			mutex_exit(&sysctl_file_marker_lock);
   2275          1.210     pooka 		sysctl_relock();
   2276          1.210     pooka 		break;
   2277          1.210     pooka 	default:
   2278          1.213     rmind 		return EINVAL;
   2279          1.210     pooka 	}
   2280          1.210     pooka 
   2281          1.210     pooka 	if (oldp == NULL)
   2282          1.210     pooka 		needed += KERN_FILESLOP * elem_size;
   2283          1.210     pooka 	*oldlenp = needed;
   2284          1.210     pooka 
   2285          1.213     rmind 	return error;
   2286          1.210     pooka }
   2287          1.210     pooka 
   2288          1.210     pooka static void
   2289          1.210     pooka fill_file(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
   2290          1.210     pooka 	  int i, pid_t pid)
   2291          1.210     pooka {
                                 	/*
                                 	 * fill_file: populate *kp with a snapshot of the open file *fp.
                                 	 *
                                 	 *	kp	output record; fully overwritten.
                                 	 *	fp	file whose fields are copied; only read here.
                                 	 *	ff	descriptor-table slot referring to fp, or NULL.
                                 	 *	i	descriptor number; stored only when ff != NULL.
                                 	 *	pid	owning process id; stored only when ff != NULL.
                                 	 *
                                 	 * Pointer-valued members are exported as integers through
                                 	 * PTRTOUINT64().  Nothing is returned.
                                 	 *
                                 	 * NOTE(review): no lock or reference is taken on fp/ff in this
                                 	 * function; presumably the caller keeps them stable - confirm
                                 	 * against the call sites.
                                 	 */
   2292          1.210     pooka 
                                 	/*
                                 	 * Zero the whole record first, so every member that is not
                                 	 * explicitly assigned below (the ki_v* members for non-vnode
                                 	 * files, the per-descriptor members when ff == NULL, and any
                                 	 * padding) reads as zero.
                                 	 */
   2293          1.210     pooka 	memset(kp, 0, sizeof(*kp));
   2294          1.210     pooka 
                                 	/* Members taken directly from the file structure. */
   2295          1.210     pooka 	kp->ki_fileaddr =	PTRTOUINT64(fp);
   2296          1.210     pooka 	kp->ki_flag =		fp->f_flag;
                                 	/* Already zero from the memset above; this store is redundant. */
   2297          1.210     pooka 	kp->ki_iflags =		0;
   2298          1.210     pooka 	kp->ki_ftype =		fp->f_type;
   2299          1.210     pooka 	kp->ki_count =		fp->f_count;
   2300          1.210     pooka 	kp->ki_msgcount =	fp->f_msgcount;
                                 	/* Credential attached to the file: its address and effective ids. */
   2301          1.210     pooka 	kp->ki_fucred =		PTRTOUINT64(fp->f_cred);
   2302          1.210     pooka 	kp->ki_fuid =		kauth_cred_geteuid(fp->f_cred);
   2303          1.210     pooka 	kp->ki_fgid =		kauth_cred_getegid(fp->f_cred);
   2304          1.210     pooka 	kp->ki_fops =		PTRTOUINT64(fp->f_ops);
   2305          1.210     pooka 	kp->ki_foffset =	fp->f_offset;
   2306          1.210     pooka 	kp->ki_fdata =		PTRTOUINT64(fp->f_data);
   2307          1.210     pooka 
   2308          1.210     pooka 	/* vnode information to glue this file to something */
   2309          1.210     pooka 	if (fp->f_type == DTYPE_VNODE) {
                                 		/* For DTYPE_VNODE files f_data is treated as the vnode pointer. */
   2310          1.210     pooka 		struct vnode *vp = (struct vnode *)fp->f_data;
   2311          1.210     pooka 
                                 		/*
                                 		 * NOTE(review): v_un looks like a union of per-type
                                 		 * pointers; vu_socket is read here whatever v_type is,
                                 		 * presumably just to export the raw pointer value -
                                 		 * confirm against the vnode definition.
                                 		 */
   2312          1.210     pooka 		kp->ki_vun =	PTRTOUINT64(vp->v_un.vu_socket);
   2313          1.210     pooka 		kp->ki_vsize =	vp->v_size;
   2314          1.210     pooka 		kp->ki_vtype =	vp->v_type;
   2315          1.210     pooka 		kp->ki_vtag =	vp->v_tag;
   2316          1.210     pooka 		kp->ki_vdata =	PTRTOUINT64(vp->v_data);
   2317          1.210     pooka 	}
   2318          1.210     pooka 
   2319          1.210     pooka 	/* process information when retrieved via KERN_FILE_BYPID */
                                 	/*
                                 	 * With ff == NULL the i and pid arguments are ignored and the
                                 	 * four members below keep the zero written by the memset.
                                 	 */
   2320          1.210     pooka 	if (ff != NULL) {
   2321          1.210     pooka 		kp->ki_pid =		pid;
   2322          1.210     pooka 		kp->ki_fd =		i;
   2323          1.210     pooka 		kp->ki_ofileflags =	ff->ff_exclose;
   2324          1.210     pooka 		kp->ki_usecount =	ff->ff_refcnt;
   2325          1.210     pooka 	}
   2326          1.210     pooka }
   2327