/*	$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);
static int vn_advlock(struct file *, void *, int, struct flock *, int);
static int vn_fpathconf(struct file *, int, register_t *);
static int vn_posix_fadvise(struct file *, off_t, off_t, int);
static int vn_truncate(file_t *, off_t);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
	.fo_seek = vn_seek,
	.fo_advlock = vn_advlock,
	.fo_fpathconf = vn_fpathconf,
	.fo_posix_fadvise = vn_posix_fadvise,
	.fo_truncate = vn_truncate,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a locked vnode in *ret_vp, or NULL in
 * *ret_vp and a file descriptor number in *ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode. Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
	int nmode, int fmode, int cmode,
	struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
	struct nameidata nd;
	struct vnode *vp = NULL;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

	KASSERT(ret_vp != NULL);
	KASSERT((ret_domove == NULL) == (ret_fd == NULL));

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	NDINIT(&nd, LOOKUP, nmode, pb);
	if (at_dvp != NULL)
		NDAT(&nd, at_dvp);

	nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		nd.ni_cnd.cn_nameiop = CREATE;
		nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			nd.ni_cnd.cn_flags |= FOLLOW;
		if ((fmode & O_EXCL) == 0)
			nd.ni_cnd.cn_flags |= NONEXCLHACK;
	} else {
		nd.ni_cnd.cn_nameiop = LOOKUP;
		nd.ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			nd.ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	/*
	 * When this "interface" was exposed to do_open() it used
	 * to initialize l_dupfd to -newfd-1 (thus passing in the
	 * new file handle number to use)... but nothing in the
	 * kernel uses that value. So just send 0.
	 */
	l->l_dupfd = 0;

	error = namei(&nd);
	if (error)
		goto out;

	vp = nd.ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		/*
		 * 20210604 dholland passing NONEXCLHACK means we can
		 * get ni_dvp == NULL back if ni_vp exists, and we should
		 * treat that like the non-O_CREAT case.
		 */
		if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
			if (vp == NULL) {
				vput(nd.ni_dvp);
			} else {
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
				if (nd.ni_dvp == nd.ni_vp)
					vrele(nd.ni_dvp);
				else
					vput(nd.ni_dvp);
				nd.ni_dvp = NULL;
				vput(vp);
				vp = NULL;
			}
		} else {
			vput(vp);
			vp = NULL;
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * 20210604 dholland ditto
	 */
	if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
		if (nd.ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
					   &nd.ni_cnd, &va);
			if (error) {
				vput(nd.ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = nd.ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(nd.ni_dvp);
		} else {
			VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			nd.ni_dvp = NULL;
			vp = nd.ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else if ((fmode & O_CREAT) != 0) {
		/*
		 * 20210606 dholland passing NONEXCLHACK means this
		 * case exists; it is the same as the following one
		 * but also needs to do things in the second (exists)
		 * half of the following block. (Besides handle
		 * ni_dvp, anyway.)
		 */
		vp = nd.ni_vp;
		KASSERT((fmode & O_EXCL) == 0);
		fmode &= ~O_CREAT;
	} else {
		vp = nd.ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (nd.ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error) {
		vput(vp);
		vp = NULL;
	}
out:
	pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

	switch (error) {
	case EDUPFD:
	case EMOVEFD:
		/* if the caller isn't prepared to handle fds, fail for them */
		if (ret_fd == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		*ret_vp = NULL;
		*ret_domove = error == EMOVEFD;
		*ret_fd = l->l_dupfd;
		error = 0;
		break;
	case 0:
		KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
		*ret_vp = vp;
		break;
	}
	l->l_dupfd = 0;
	return error;
}
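
/*
 * Example (illustrative sketch only; the path and flags are
 * hypothetical): a typical in-kernel open/use/close sequence built
 * on vn_open().  On success the vnode is returned locked; callers
 * unlock it and later hand the unlocked vnode to vn_close(), as
 * vn_bdev_openpath() below does.
 *
 *	struct pathbuf *pb;
 *	struct vnode *vp;
 *	int error;
 *
 *	pb = pathbuf_create("/some/path");
 *	if (pb == NULL)
 *		return ENOMEM;
 *	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
 *	pathbuf_destroy(pb);
 *	if (error != 0)
 *		return error;
 *	(... use the locked vnode ...)
 *	VOP_UNLOCK(vp);
 *	(... later ...)
 *	error = vn_close(vp, FREAD, curlwp->l_cred);
 */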

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return ETXTBSY;
	return 0;
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return 0;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return ETXTBSY;
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return 0;
}

/*
 * Vnode close call
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return error;
}

static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

 out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return error;
}
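
/*
 * Example (sketch; the buffer and offset are hypothetical): reading
 * the start of a file into a kernel buffer.  With ioflg 0 the vnode
 * must be unlocked on entry; pass IO_NODELOCKED if the caller already
 * holds the vnode lock.
 *
 *	char buf[128];
 *	size_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0,
 *	    UIO_SYSSPACE, 0, curlwp->l_cred, &resid, curlwp);
 *	if (error == 0)
 *		(... sizeof(buf) - resid bytes are now in buf ...)
 */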

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return EINVAL;
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	mutex_enter(&fp->f_lock);
	auio.uio_offset = fp->f_offset;
	mutex_exit(&fp->f_lock);
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return error;

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return error;
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	uio->uio_offset = *offset;
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes the behaviour for count = 0 as follows:
			 * "Before any action ... is taken, and if nbyte is zero
			 * and the file is a regular file, the write() function
			 * ... in the absence of errors ... shall return zero
			 * and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

 out:
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return error;
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return EBADF;
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return 0;
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			if (error == 0) {
				if (vp->v_type == VDIR)
					mutex_enter(&fp->f_lock);
				*(int *)data = vattr.va_size - fp->f_offset;
				if (vp->v_type == VDIR)
					mutex_exit(&fp->f_lock);
			}
			VOP_UNLOCK(vp);
			if (error)
				return error;
			return 0;
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return 0;
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return EINVAL;
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return EINVAL;
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return 0;			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag,
		    kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return error;

	default:
		return EPASSTHROUGH;
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	KASSERT(size > 0);

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
#if SIZE_MAX > UINT32_MAX	/* XXX -Wtype-limits */
	if (vp->v_type != VCHR && size > __type_max(off_t)) {
		return EOVERFLOW;
	}
#endif
	if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		       "%s (pid %d command %s)\n", vp->v_type == VCHR ?
		       "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
		       p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}

	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * if the file is writable, only add PROT_WRITE to
		 * maxprot if the file is not immutable, append-only.
		 * otherwise, if we have asked for PROT_WRITE, return
		 * EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			     (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings can always be written to */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
					  (flags & MAP_SHARED) ? i :
					  (i & ~VM_PROT_WRITE), off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
		(flags & MAP_SHARED) != 0 &&
		(maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0

	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable. Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
			    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	const off_t OFF_MIN = __type_min(off_t);
	const off_t OFF_MAX = __type_max(off_t);
	kauth_cred_t cred = fp->f_cred;
	off_t oldoff, newoff;
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int error;

	if (vp->v_type == VFIFO)
		return ESPIPE;

	if (flags & FOF_UPDATE_OFFSET)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	/* Compute the old and new offsets.  */
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	oldoff = fp->f_offset;
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	switch (whence) {
	case SEEK_CUR:
		if (delta > 0) {
			if (oldoff > 0 && delta > OFF_MAX - oldoff) {
				newoff = OFF_MAX;
				break;
			}
		} else {
			if (oldoff < 0 && delta < OFF_MIN - oldoff) {
				newoff = OFF_MIN;
				break;
			}
		}
		newoff = oldoff + delta;
		break;
	case SEEK_END:
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		if (vattr.va_size > OFF_MAX ||
		    delta > OFF_MAX - (off_t)vattr.va_size) {
			newoff = OFF_MAX;
			break;
		}
		newoff = delta + vattr.va_size;
		break;
	case SEEK_SET:
		newoff = delta;
		break;
	default:
		error = EINVAL;
		goto out;
	}

	/* Pass the proposed change to the file system to audit.  */
	error = VOP_SEEK(vp, oldoff, newoff, cred);
	if (error)
		goto out;

	/* Success!  */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;
	error = 0;

out:	VOP_UNLOCK(vp);
	return error;
}

static int
vn_advlock(struct file *fp, void *id, int op, struct flock *fl,
    int flags)
{
	struct vnode *const vp = fp->f_vnode;

	if (fl->l_whence == SEEK_CUR) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		fl->l_start += fp->f_offset;
		VOP_UNLOCK(vp);
	}

	return VOP_ADVLOCK(vp, id, op, fl, flags);
}

static int
vn_fpathconf(struct file *fp, int name, register_t *retval)
{
	struct vnode *const vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_PATHCONF(vp, name, retval);
	VOP_UNLOCK(vp);

	return error;
}

static int
vn_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
	const off_t OFF_MAX = __type_max(off_t);
	struct vnode *vp = fp->f_vnode;
	off_t endoffset;
	int error;

	if (offset < 0) {
		return EINVAL;
	}
	if (len == 0) {
		endoffset = OFF_MAX;
	} else if (len > 0 && (OFF_MAX - offset) >= len) {
		endoffset = offset + len;
	} else {
		return EINVAL;
	}

	CTASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL);
	CTASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM);
	CTASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL);

	switch (advice) {
	case POSIX_FADV_WILLNEED:
	case POSIX_FADV_DONTNEED:
		if (vp->v_type != VREG && vp->v_type != VBLK)
			return 0;
		break;
	}

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
		/*
		 * We ignore offset and size.  Must lock the file to
		 * do this, as f_advice is sub-word sized.
		 */
		mutex_enter(&fp->f_lock);
		fp->f_advice = (u_char)advice;
		mutex_exit(&fp->f_lock);
		error = 0;
		break;

	case POSIX_FADV_WILLNEED:
		error = uvm_readahead(&vp->v_uobj, offset, endoffset - offset);
		break;

	case POSIX_FADV_DONTNEED:
		/*
		 * Align the region to page boundaries as VOP_PUTPAGES expects
		 * by shrinking it.  We shrink instead of expand because we
		 * do not want to deactivate cache outside of the requested
		 * region.  It means that if the specified region is smaller
		 * than PAGE_SIZE, we do nothing.
		 */
		if (offset <= trunc_page(OFF_MAX) &&
		    round_page(offset) < trunc_page(endoffset)) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			error = VOP_PUTPAGES(vp,
			    round_page(offset), trunc_page(endoffset),
			    PGO_DEACTIVATE | PGO_CLEANIT);
		} else {
			error = 0;
		}
		break;

	case POSIX_FADV_NOREUSE:
		/* Not implemented yet. */
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}

	return error;
}

static int
vn_truncate(file_t *fp, off_t length)
{
	struct vattr vattr;
	struct vnode *vp;
	int error = 0;

	if (length < 0)
		return EINVAL;

	if ((fp->f_flag & FWRITE) == 0)
		return EINVAL;
	vp = fp->f_vnode;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
	else if ((error = vn_writechk(vp)) == 0) {
		vattr_null(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	VOP_UNLOCK(vp);

	return error;
}


/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

	KASSERT(vrefcnt(vp) > 0);
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	switch (flags & (LK_RETRY | LK_NOWAIT)) {
	case 0:
		KASSERT(error == 0 || error == ENOENT);
		break;
	case LK_RETRY:
		KASSERT(error == 0);
		break;
	case LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY || error == ENOENT);
		break;
	case LK_RETRY | LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY);
		break;
	}

	return error;
}
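
/*
 * Example (sketch): the usual pairing of vn_lock() with VOP_UNLOCK().
 * As the assertions above encode, LK_RETRY acquires the lock even if
 * the vnode has been reclaimed, so no error check is needed; without
 * LK_RETRY the caller must be prepared for ENOENT.
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	(... read-only VOPs ...)
 *	VOP_UNLOCK(vp);
 *
 *	if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0)
 *		return error;	(vnode was reclaimed)
 *	(... modifying VOPs ...)
 *	VOP_UNLOCK(vp);
 */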

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return error;
}

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}
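
/*
 * Example (sketch; the attribute name is hypothetical): round-tripping
 * a small value through the wrappers above.  With ioflg 0 the wrappers
 * take and release the vnode lock themselves; vn_extattr_get() returns
 * the length actually read back through buflen.
 *
 *	char val[16] = "hello";
 *	size_t len = sizeof(val);
 *	int error;
 *
 *	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "myattr", strlen(val), val, curlwp);
 *	if (error == 0)
 *		error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *		    "myattr", &len, val, curlwp);
 */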

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vput(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);
	VOP_UNLOCK(*vpp);

	return 0;
}

/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int error;

	error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
	if (error != 0)
		return error;

	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}
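
/*
 * Example (sketch; the device path is hypothetical): obtaining an
 * open, unlocked block-device vnode by path, as a swap or dump
 * configuration path might.  The caller still owns the pathbuf.
 *
 *	struct pathbuf *pb = pathbuf_create("/dev/wd0b");
 *	struct vnode *vp;
 *	int error;
 *
 *	if (pb == NULL)
 *		return ENOMEM;
 *	error = vn_bdev_openpath(pb, &vp, curlwp);
 *	pathbuf_destroy(pb);
 *	if (error != 0)
 *		return error;	(e.g. ENOTBLK for a non-block device)
 */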

static long
vn_knote_to_interest(const struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
		/*
		 * Writing to the file or changing its attributes can
		 * set the file size, which impacts the readability
		 * filter.
		 *
		 * (No need to set NOTE_EXTEND here; it's only ever
		 * sent with other hints; see vnode_if.c.)
		 */
		return NOTE_WRITE | NOTE_ATTRIB;

	case EVFILT_VNODE:
		return kn->kn_sfflags;

	case EVFILT_WRITE:
	default:
		return 0;
	}
}

void
vn_knote_attach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/*
	 * In the case of layered / stacked file systems, knotes
	 * should only ever be associated with the base vnode.
	 */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We maintain a bitmask of the kevents that there is interest in,
	 * to minimize the impact of having watchers.  It's silly to have
	 * to traverse vn_klist every time a read or write happens simply
	 * because there is someone interested in knowing when the file
	 * is deleted, for example.
	 */

	mutex_enter(vp->v_interlock);
	SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
	SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
		interest |= vn_knote_to_interest(kn);
	}
	vk->vk_interest = interest;
	mutex_exit(vp->v_interlock);
}
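
/*
 * Example (sketch): how the interest mask pays off on the event-post
 * side.  A notifier can test vk_interest and skip event delivery
 * entirely when nobody is listening for the hint, along the lines of
 * the VN_KNOTE() macro:
 *
 *	if (vp->v_klist->vk_interest & NOTE_WRITE)
 *		knote(&vp->v_klist->vk_klist, NOTE_WRITE);
 */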

void
vn_knote_detach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/* See above. */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We special case removing the head of the list, because:
	 *
	 * 1. It's extremely likely that we're detaching the only
	 *    knote.
	 *
	 * 2. We're already traversing the whole list, so we don't
	 *    want to use the generic SLIST_REMOVE() which would
	 *    traverse it *again*.
	 */

	mutex_enter(vp->v_interlock);
	if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
		SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
		SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
			interest |= vn_knote_to_interest(kn);
		}
		vk->vk_interest = interest;
	} else {
		struct knote *thiskn, *nextkn, *prevkn = NULL;

		SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
			if (thiskn == kn) {
				KASSERT(kn != NULL);
				KASSERT(prevkn != NULL);
				SLIST_REMOVE_AFTER(prevkn, kn_selnext);
				kn = NULL;
			} else {
				interest |= vn_knote_to_interest(thiskn);
				prevkn = thiskn;
			}
		}
		vk->vk_interest = interest;
	}
	mutex_exit(vp->v_interlock);
}