/*	$NetBSD: vfs_vnops.c,v 1.243 2024/12/07 02:11:43 riastradh Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.243 2024/12/07 02:11:43 riastradh Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/verified_exec.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>

#include <miscfs/fifofs/fifo.h>
#include <miscfs/specfs/specdev.h>

#include <uvm/uvm_device.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);
static int vn_advlock(struct file *, void *, int, struct flock *, int);
static int vn_fpathconf(struct file *, int, register_t *);
static int vn_posix_fadvise(struct file *, off_t, off_t, int);
static int vn_truncate(file_t *, off_t);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
	.fo_seek = vn_seek,
	.fo_advlock = vn_advlock,
	.fo_fpathconf = vn_fpathconf,
	.fo_posix_fadvise = vn_posix_fadvise,
	.fo_truncate = vn_truncate,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a locked vnode in *ret_vp, or NULL in
 * *ret_vp and a file descriptor number in *ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode. Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
    int nmode, int fmode, int cmode,
    struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
	struct nameidata nd;
	struct vnode *vp = NULL;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

	KASSERT(ret_vp != NULL);
	KASSERT((ret_domove == NULL) == (ret_fd == NULL));

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	NDINIT(&nd, LOOKUP, nmode, pb);
	if (at_dvp != NULL)
		NDAT(&nd, at_dvp);

	nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		nd.ni_cnd.cn_nameiop = CREATE;
		nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			nd.ni_cnd.cn_flags |= FOLLOW;
		if ((fmode & O_EXCL) == 0)
			nd.ni_cnd.cn_flags |= NONEXCLHACK;
	} else {
		nd.ni_cnd.cn_nameiop = LOOKUP;
		nd.ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			nd.ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	/*
	 * When this "interface" was exposed to do_open() it used
	 * to initialize l_dupfd to -newfd-1 (thus passing in the
	 * new file handle number to use)... but nothing in the
	 * kernel uses that value. So just send 0.
	 */
	l->l_dupfd = 0;

	error = namei(&nd);
	if (error)
		goto out;

	vp = nd.ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		/*
		 * 20210604 dholland passing NONEXCLHACK means we can
		 * get ni_dvp == NULL back if ni_vp exists, and we should
		 * treat that like the non-O_CREAT case.
		 */
		if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
			if (vp == NULL) {
				vput(nd.ni_dvp);
			} else {
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
				if (nd.ni_dvp == nd.ni_vp)
					vrele(nd.ni_dvp);
				else
					vput(nd.ni_dvp);
				nd.ni_dvp = NULL;
				vput(vp);
				vp = NULL;
			}
		} else {
			vput(vp);
			vp = NULL;
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * 20210604 dholland ditto
	 */
	if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
		if (nd.ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
			    &nd.ni_cnd, &va);
			if (error) {
				vput(nd.ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = nd.ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(nd.ni_dvp);
		} else {
			VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			nd.ni_dvp = NULL;
			vp = nd.ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else if ((fmode & O_CREAT) != 0) {
		/*
		 * 20210606 dholland passing NONEXCLHACK means this
		 * case exists; it is the same as the following one
		 * but also needs to do things in the second (exists)
		 * half of the following block. (Besides handle
		 * ni_dvp, anyway.)
		 */
		vp = nd.ni_vp;
		KASSERT((fmode & O_EXCL) == 0);
		fmode &= ~O_CREAT;
	} else {
		vp = nd.ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (nd.ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error) {
		vput(vp);
		vp = NULL;
	}
out:
	pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

	switch (error) {
	case EDUPFD:
	case EMOVEFD:
		/* if the caller isn't prepared to handle fds, fail for them */
		if (ret_fd == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		*ret_vp = NULL;
		*ret_domove = error == EMOVEFD;
		*ret_fd = l->l_dupfd;
		error = 0;
		break;
	case 0:
		KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
		*ret_vp = vp;
		break;
	}
	l->l_dupfd = 0;
	return error;
}
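
/*
 * Illustrative sketch, not part of the original source: one way an
 * in-kernel caller might drive vn_open() to open a file read-only by
 * path.  The function name is hypothetical; the pathbuf handling and
 * unlock protocol follow the contract documented above (on success
 * *vpp comes back locked, and the caller still owns the pathbuf).
 * The vnode must eventually be released with vn_close(*vpp, FREAD, ...).
 */
#if 0
static int
example_open_ro(const char *path, struct vnode **vpp)
{
	struct pathbuf *pb;
	int error;

	pb = pathbuf_create(path);
	if (pb == NULL)
		return ENOMEM;
	/* No openat() directory, no extra namei flags, no fd return. */
	error = vn_open(NULL, pb, 0, FREAD, 0, vpp, NULL, NULL);
	pathbuf_destroy(pb);
	if (error != 0)
		return error;
	/* *vpp is locked exclusive here; drop the lock, keep the ref. */
	VOP_UNLOCK(*vpp);
	return 0;
}
#endif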

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return ETXTBSY;
	return 0;
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return 0;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return ETXTBSY;
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return 0;
}

/*
 * Vnode close call
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return error;
}

static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return error;
}
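
/*
 * Illustrative sketch, not part of the original source: reading the
 * first bytes of a vnode into a kernel buffer with vn_rdwr().  Since
 * IO_NODELOCKED is not passed, vn_rdwr() takes and releases the vnode
 * lock itself; on return *residp holds the bytes NOT transferred.
 * The function name is hypothetical.
 */
#if 0
static int
example_read_head(struct vnode *vp, void *buf, int len, size_t *residp)
{

	return vn_rdwr(UIO_READ, vp, buf, len, 0 /* offset */,
	    UIO_SYSSPACE, 0 /* ioflg */, NOCRED, residp, curlwp);
}
#endif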

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return EINVAL;
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	mutex_enter(&fp->f_lock);
	auio.uio_offset = fp->f_offset;
	mutex_exit(&fp->f_lock);
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return error;

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return error;
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	uio->uio_offset = *offset;
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes behaviour for count = 0 as
			 * following: "Before any action ... is taken,
			 * and if nbyte is zero and the file is a
			 * regular file, the write() function ... in
			 * the absence of errors ... shall return zero
			 * and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

out:
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return error;
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return EBADF;
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return 0;
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			if (error == 0) {
				if (vp->v_type == VDIR)
					mutex_enter(&fp->f_lock);
				*(int *)data = vattr.va_size - fp->f_offset;
				if (vp->v_type == VDIR)
					mutex_exit(&fp->f_lock);
			}
			VOP_UNLOCK(vp);
			if (error)
				return error;
			return 0;
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return 0;
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return EINVAL;
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return EINVAL;
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return 0;			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag, kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return error;

	default:
		return EPASSTHROUGH;
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	KASSERT(size > 0);

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
#if SIZE_MAX > UINT32_MAX	/* XXX -Wtype-limits */
	if (vp->v_type != VCHR && size > __type_max(off_t)) {
		return EOVERFLOW;
	}
#endif
	if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		    "%s (pid %d command %s)\n",
		    vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
		    p->p_pid,
		    p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}

	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * if the file is writable, only add PROT_WRITE to
		 * maxprot if the file is not immutable, append-only.
		 * otherwise, if we have asked for PROT_WRITE, return
		 * EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
				(SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings can always be written to */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? i : (i & ~VM_PROT_WRITE),
			    off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
	    (flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0
	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable. Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
		NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp, int flags)
{
	const off_t OFF_MIN = __type_min(off_t);
	const off_t OFF_MAX = __type_max(off_t);
	kauth_cred_t cred = fp->f_cred;
	off_t oldoff, newoff;
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int error;

	if (vp->v_type == VFIFO)
		return ESPIPE;

	if (flags & FOF_UPDATE_OFFSET)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	/* Compute the old and new offsets.  */
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	oldoff = fp->f_offset;
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	switch (whence) {
	case SEEK_CUR:
		if (delta > 0) {
			if (oldoff > 0 && delta > OFF_MAX - oldoff) {
				newoff = OFF_MAX;
				break;
			}
		} else {
			if (oldoff < 0 && delta < OFF_MIN - oldoff) {
				newoff = OFF_MIN;
				break;
			}
		}
		newoff = oldoff + delta;
		break;
	case SEEK_END:
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		if (vattr.va_size > OFF_MAX ||
		    delta > OFF_MAX - (off_t)vattr.va_size) {
			newoff = OFF_MAX;
			break;
		}
		newoff = delta + vattr.va_size;
		break;
	case SEEK_SET:
		newoff = delta;
		break;
	default:
		error = EINVAL;
		goto out;
	}

	/* Pass the proposed change to the file system to audit.  */
	error = VOP_SEEK(vp, oldoff, newoff, cred);
	if (error)
		goto out;

	/* Success!  */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;
	error = 0;

out:	VOP_UNLOCK(vp);
	return error;
}

static int
vn_advlock(struct file *fp, void *id, int op, struct flock *fl, int flags)
{
	struct vnode *const vp = fp->f_vnode;

	if (fl->l_whence == SEEK_CUR) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		fl->l_start += fp->f_offset;
		VOP_UNLOCK(vp);
	}

	return VOP_ADVLOCK(vp, id, op, fl, flags);
}

static int
vn_fpathconf(struct file *fp, int name, register_t *retval)
{
	struct vnode *const vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_PATHCONF(vp, name, retval);
	VOP_UNLOCK(vp);

	return error;
}

static int
vn_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
	const off_t OFF_MAX = __type_max(off_t);
	struct vnode *vp = fp->f_vnode;
	off_t endoffset;
	int error;

	if (offset < 0) {
		return EINVAL;
	}
	if (len == 0) {
		endoffset = OFF_MAX;
	} else if (len > 0 && (OFF_MAX - offset) >= len) {
		endoffset = offset + len;
	} else {
		return EINVAL;
	}

	CTASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL);
	CTASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM);
	CTASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL);

	switch (advice) {
	case POSIX_FADV_WILLNEED:
	case POSIX_FADV_DONTNEED:
		if (vp->v_type != VREG && vp->v_type != VBLK)
			return 0;
		break;
	}

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
		/*
		 * We ignore offset and size.  Must lock the file to
		 * do this, as f_advice is sub-word sized.
		 */
		mutex_enter(&fp->f_lock);
		fp->f_advice = (u_char)advice;
		mutex_exit(&fp->f_lock);
		error = 0;
		break;

	case POSIX_FADV_WILLNEED:
		error = uvm_readahead(&vp->v_uobj, offset, endoffset - offset);
		break;

	case POSIX_FADV_DONTNEED:
		/*
		 * Align the region to page boundaries as VOP_PUTPAGES expects
		 * by shrinking it.  We shrink instead of expand because we
		 * do not want to deactivate cache outside of the requested
		 * region.  It means that if the specified region is smaller
		 * than PAGE_SIZE, we do nothing.
		 */
		if (offset <= trunc_page(OFF_MAX) &&
		    round_page(offset) < trunc_page(endoffset)) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			error = VOP_PUTPAGES(vp,
			    round_page(offset), trunc_page(endoffset),
			    PGO_DEACTIVATE | PGO_CLEANIT);
		} else {
			error = 0;
		}
		break;

	case POSIX_FADV_NOREUSE:
		/* Not implemented yet. */
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}

	return error;
}

static int
vn_truncate(file_t *fp, off_t length)
{
	struct vattr vattr;
	struct vnode *vp;
	int error = 0;

	if (length < 0)
		return EINVAL;

	if ((fp->f_flag & FWRITE) == 0)
		return EINVAL;
	vp = fp->f_vnode;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
	else if ((error = vn_writechk(vp)) == 0) {
		vattr_null(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	VOP_UNLOCK(vp);

	return error;
}


/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

	KASSERT(vrefcnt(vp) > 0);
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	switch (flags & (LK_RETRY | LK_NOWAIT)) {
	case 0:
		KASSERT(error == 0 || error == ENOENT);
		break;
	case LK_RETRY:
		KASSERT(error == 0);
		break;
	case LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY || error == ENOENT);
		break;
	case LK_RETRY | LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY);
		break;
	}

	return error;
}
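
/*
 * Illustrative sketch, not part of the original source: the common
 * vn_lock()/VOP_UNLOCK() pairing.  With LK_RETRY the lock attempt
 * cannot fail; without LK_RETRY a caller must be prepared for ENOENT,
 * meaning the vnode was reclaimed while waiting and must not be used.
 * The function name is hypothetical.
 */
#if 0
static int
example_getattr(struct vnode *vp, struct vattr *vap)
{
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);	/* cannot fail with LK_RETRY */
	error = VOP_GETATTR(vp, vap, kauth_cred_get());
	VOP_UNLOCK(vp);
	return error;
}
#endif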

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return error;
}
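
/*
 * Illustrative sketch, not part of the original source: fetching a
 * system-namespace extended attribute (EXTATTR_NAMESPACE_SYSTEM, from
 * <sys/extattr.h>) into a fixed-size buffer with the wrapper above.
 * On success *buflenp is updated to the bytes actually read.  The
 * function name and the attribute name are made-up examples.
 */
#if 0
static int
example_get_sysattr(struct vnode *vp, void *buf, size_t *buflenp)
{

	/* Vnode is not already locked, so don't pass IO_NODELOCKED. */
	return vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example", buflenp, buf, curlwp);
}
#endif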

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int     error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vput(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);
	VOP_UNLOCK(*vpp);

	return 0;
}

/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int     error;

	error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
	if (error != 0)
		return error;

	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}
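
/*
 * Illustrative sketch, not part of the original source: opening a
 * block device for kernel use by path name via vn_bdev_openpath().
 * The device path is a made-up example; as with vn_open(), the caller
 * creates and destroys the pathbuf.
 */
#if 0
static int
example_open_blockdev(struct vnode **vpp)
{
	struct pathbuf *pb;
	int error;

	pb = pathbuf_create("/dev/wd0a");
	if (pb == NULL)
		return ENOMEM;
	error = vn_bdev_openpath(pb, vpp, curlwp);
	pathbuf_destroy(pb);
	return error;
}
#endif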

static long
vn_knote_to_interest(const struct knote *kn)
{

	switch (kn->kn_filter) {
	case EVFILT_READ:
		/*
		 * Writing to the file or changing its attributes can
		 * set the file size, which impacts the readability
		 * filter.
		 *
		 * (No need to set NOTE_EXTEND here; it's only ever
		 * sent with other hints; see vnode_if.c.)
		 */
		return NOTE_WRITE | NOTE_ATTRIB;

	case EVFILT_VNODE:
		return kn->kn_sfflags;

	case EVFILT_WRITE:
	default:
		return 0;
	}
}

void
vn_knote_attach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/*
	 * In the case of layered / stacked file systems, knotes
	 * should only ever be associated with the base vnode.
	 */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We maintain a bitmask of the kevents that there is interest in,
	 * to minimize the impact of having watchers.  It's silly to have
	 * to traverse vn_klist every time a read or write happens simply
	 * because there is someone interested in knowing when the file
	 * is deleted, for example.
	 */

	mutex_enter(vp->v_interlock);
	SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
	SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
		interest |= vn_knote_to_interest(kn);
	}
	vk->vk_interest = interest;
	mutex_exit(vp->v_interlock);
}

void
vn_knote_detach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/* See above. */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We special case removing the head of the list, because:
	 *
	 * 1. It's extremely likely that we're detaching the only
	 *    knote.
	 *
	 * 2. We're already traversing the whole list, so we don't
	 *    want to use the generic SLIST_REMOVE() which would
	 *    traverse it *again*.
	 */

	mutex_enter(vp->v_interlock);
	if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
		SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
		SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
			interest |= vn_knote_to_interest(kn);
		}
		vk->vk_interest = interest;
	} else {
		struct knote *thiskn, *nextkn, *prevkn = NULL;

		SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
			if (thiskn == kn) {
				KASSERT(kn != NULL);
				KASSERT(prevkn != NULL);
				SLIST_REMOVE_AFTER(prevkn, kn_selnext);
				kn = NULL;
			} else {
				interest |= vn_knote_to_interest(thiskn);
				prevkn = thiskn;
			}
		}
		vk->vk_interest = interest;
	}
	mutex_exit(vp->v_interlock);
}