vfs_vnode.c revision 1.39.2.3
      1 /*	$NetBSD: vfs_vnode.c,v 1.39.2.3 2015/09/22 12:06:07 skrll Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * Copyright (c) 1989, 1993
     35  *	The Regents of the University of California.  All rights reserved.
     36  * (c) UNIX System Laboratories, Inc.
     37  * All or some portions of this file are derived from material licensed
     38  * to the University of California by American Telephone and Telegraph
     39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     40  * the permission of UNIX System Laboratories, Inc.
     41  *
     42  * Redistribution and use in source and binary forms, with or without
     43  * modification, are permitted provided that the following conditions
     44  * are met:
     45  * 1. Redistributions of source code must retain the above copyright
     46  *    notice, this list of conditions and the following disclaimer.
     47  * 2. Redistributions in binary form must reproduce the above copyright
     48  *    notice, this list of conditions and the following disclaimer in the
     49  *    documentation and/or other materials provided with the distribution.
     50  * 3. Neither the name of the University nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     64  * SUCH DAMAGE.
     65  *
     66  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
     67  */
     68 
     69 /*
     70  * The vnode cache subsystem.
     71  *
     72  * Life-cycle
     73  *
     74  *	Normally, there are two points where new vnodes are created:
     75  *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
     76  *	starts in one of the following ways:
     77  *
     78  *	- Allocation, via vcache_get(9) or vcache_new(9).
     79  *	- Reclamation of inactive vnode, via vget(9).
     80  *
     81  *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
     82  *	was traditionally another way.  Currently, only the draining thread
     83  *	recycles vnodes.  This behaviour might be revisited.
     84  *
     85  *	The life-cycle ends when the last reference is dropped, usually
     86  *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to
     87  *	inform the file system that the vnode is inactive.  Via this call,
     88  *	the file system indicates whether the vnode can be recycled (usually
     89  *	by checking its link count or whether the file was removed).
     90  *
     91  *	Depending on that indication, the vnode is either put onto a free
     92  *	list (cache) or cleaned via vclean(9), which calls VOP_RECLAIM(9)
     93  *	to disassociate the underlying file system from it, and then destroyed.
     94  *
     95  * Reference counting
     96  *
     97  *	A vnode is considered active if its reference count
     98  *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
     99  *	the vref(9), vrele(9) and vput(9) routines.  Common points holding
    100  *	references are e.g. open files, working directories and mount points.
    101  *
    102  * Note on v_usecount and its locking
    103  *
    104  *	At nearly all points where it is known that v_usecount could be
    105  *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
    106  *	from zero, the interlock must be held.  To change from a non-zero
    107  *	value to zero, again the interlock must be held.
    108  *
    109  *	Changing the usecount from a non-zero value to a non-zero value can
    110  *	safely be done using atomic operations, without the interlock held.
    111  *
    112  *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
    113  *	mntvnode_lock is still held.
    114  *
    115  *	See PR 41374.
    116  */
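        /*
         * Illustrative sketch (not part of this file): how a caller is
         * expected to follow the v_usecount rules above.  Everything other
         * than vget(9), vref(9) and vrele(9) below is hypothetical.
         *
         *	vnode_t *vp = ...;	// found via some index; may sit on a free list
         *
         *	mutex_enter(vp->v_interlock);	// interlock covers the 0 -> 1 transition
         *	error = vget(vp, 0, true);	// takes a reference, drops the interlock
         *	if (error != 0)
         *		return error;	// vnode was (or is being) cleaned out
         *
         *	vref(vp);	// non-zero -> non-zero: no interlock needed
         *	...
         *	vrele(vp);	// drop the extra reference
         *	vrele(vp);	// last reference: may inactivate/recycle the vnode
         */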
    117 
    118 #include <sys/cdefs.h>
    119 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.39.2.3 2015/09/22 12:06:07 skrll Exp $");
    120 
    121 #define _VFS_VNODE_PRIVATE
    122 
    123 #include <sys/param.h>
    124 #include <sys/kernel.h>
    125 
    126 #include <sys/atomic.h>
    127 #include <sys/buf.h>
    128 #include <sys/conf.h>
    129 #include <sys/device.h>
    130 #include <sys/hash.h>
    131 #include <sys/kauth.h>
    132 #include <sys/kmem.h>
    133 #include <sys/kthread.h>
    134 #include <sys/module.h>
    135 #include <sys/mount.h>
    136 #include <sys/namei.h>
    137 #include <sys/syscallargs.h>
    138 #include <sys/sysctl.h>
    139 #include <sys/systm.h>
    140 #include <sys/vnode.h>
    141 #include <sys/wapbl.h>
    142 #include <sys/fstrans.h>
    143 
    144 #include <uvm/uvm.h>
    145 #include <uvm/uvm_readahead.h>
    146 
    147 /* Flags to vrelel. */
    148 #define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
    149 #define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */
    150 
    151 struct vcache_key {
    152 	struct mount *vk_mount;
    153 	const void *vk_key;
    154 	size_t vk_key_len;
    155 };
    156 struct vcache_node {
    157 	SLIST_ENTRY(vcache_node) vn_hash;
    158 	struct vnode *vn_vnode;
    159 	struct vcache_key vn_key;
    160 };
    161 
    162 u_int			numvnodes		__cacheline_aligned;
    163 
    164 static pool_cache_t	vnode_cache		__read_mostly;
    165 
    166 /*
    167  * There are two free lists: one is for vnodes which have no buffer/page
    168  * references and one for those which do (i.e. v_holdcnt is non-zero).
    169  * Vnode recycling mechanism first attempts to look into the former list.
    170  */
    171 static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
    172 static vnodelst_t	vnode_free_list		__cacheline_aligned;
    173 static vnodelst_t	vnode_hold_list		__cacheline_aligned;
    174 static kcondvar_t	vdrain_cv		__cacheline_aligned;
    175 
    176 static vnodelst_t	vrele_list		__cacheline_aligned;
    177 static kmutex_t		vrele_lock		__cacheline_aligned;
    178 static kcondvar_t	vrele_cv		__cacheline_aligned;
    179 static lwp_t *		vrele_lwp		__cacheline_aligned;
    180 static int		vrele_pending		__cacheline_aligned;
    181 static int		vrele_gen		__cacheline_aligned;
    182 
    183 SLIST_HEAD(hashhead, vcache_node);
    184 static struct {
    185 	kmutex_t	lock;
    186 	u_long		hashmask;
    187 	struct hashhead	*hashtab;
    188 	pool_cache_t	pool;
    189 }			vcache			__cacheline_aligned;
    190 
    191 static int		cleanvnode(void);
    192 static void		vcache_init(void);
    193 static void		vcache_reinit(void);
    194 static void		vclean(vnode_t *);
    195 static void		vrelel(vnode_t *, int);
    196 static void		vdrain_thread(void *);
    197 static void		vrele_thread(void *);
    198 static void		vnpanic(vnode_t *, const char *, ...)
    199     __printflike(2, 3);
    200 static void		vwait(vnode_t *, int);
    201 
    202 /* Routines having to do with the management of the vnode table. */
    203 extern struct mount	*dead_rootmount;
    204 extern int		(**dead_vnodeop_p)(void *);
    205 extern struct vfsops	dead_vfsops;
    206 
    207 void
    208 vfs_vnode_sysinit(void)
    209 {
    210 	int error __diagused;
    211 
    212 	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
    213 	    NULL, IPL_NONE, NULL, NULL, NULL);
    214 	KASSERT(vnode_cache != NULL);
    215 
    216 	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
    217 	KASSERT(dead_rootmount != NULL);
    218 	dead_rootmount->mnt_iflag = IMNT_MPSAFE;
    219 
    220 	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
    221 	TAILQ_INIT(&vnode_free_list);
    222 	TAILQ_INIT(&vnode_hold_list);
    223 	TAILQ_INIT(&vrele_list);
    224 
    225 	vcache_init();
    226 
    227 	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
    228 	cv_init(&vdrain_cv, "vdrain");
    229 	cv_init(&vrele_cv, "vrele");
    230 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
    231 	    NULL, NULL, "vdrain");
    232 	KASSERT(error == 0);
    233 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
    234 	    NULL, &vrele_lwp, "vrele");
    235 	KASSERT(error == 0);
    236 }
    237 
    238 /*
    239  * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
    240  * marker vnode.
    241  */
    242 vnode_t *
    243 vnalloc(struct mount *mp)
    244 {
    245 	vnode_t *vp;
    246 
    247 	vp = pool_cache_get(vnode_cache, PR_WAITOK);
    248 	KASSERT(vp != NULL);
    249 
    250 	memset(vp, 0, sizeof(*vp));
    251 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
    252 	cv_init(&vp->v_cv, "vnode");
    253 	/*
    254 	 * Done by memset() above.
    255 	 *	LIST_INIT(&vp->v_nclist);
    256 	 *	LIST_INIT(&vp->v_dnclist);
    257 	 */
    258 
    259 	if (mp != NULL) {
    260 		vp->v_mount = mp;
    261 		vp->v_type = VBAD;
    262 		vp->v_iflag = VI_MARKER;
    263 		return vp;
    264 	}
    265 
    266 	mutex_enter(&vnode_free_list_lock);
    267 	numvnodes++;
    268 	if (numvnodes > desiredvnodes + desiredvnodes / 10)
    269 		cv_signal(&vdrain_cv);
    270 	mutex_exit(&vnode_free_list_lock);
    271 
    272 	rw_init(&vp->v_lock);
    273 	vp->v_usecount = 1;
    274 	vp->v_type = VNON;
    275 	vp->v_size = vp->v_writesize = VSIZENOTSET;
    276 
    277 	return vp;
    278 }
    279 
    280 /*
    281  * Free an unused, unreferenced vnode.
    282  */
    283 void
    284 vnfree(vnode_t *vp)
    285 {
    286 
    287 	KASSERT(vp->v_usecount == 0);
    288 
    289 	if ((vp->v_iflag & VI_MARKER) == 0) {
    290 		rw_destroy(&vp->v_lock);
    291 		mutex_enter(&vnode_free_list_lock);
    292 		numvnodes--;
    293 		mutex_exit(&vnode_free_list_lock);
    294 	}
    295 
    296 	uvm_obj_destroy(&vp->v_uobj, true);
    297 	cv_destroy(&vp->v_cv);
    298 	pool_cache_put(vnode_cache, vp);
    299 }
    300 
    301 /*
    302  * cleanvnode: grab a vnode from freelist, clean and free it.
    303  *
    304  * => Releases vnode_free_list_lock.
    305  */
    306 static int
    307 cleanvnode(void)
    308 {
    309 	vnode_t *vp;
    310 	vnodelst_t *listhd;
    311 	struct mount *mp;
    312 
    313 	KASSERT(mutex_owned(&vnode_free_list_lock));
    314 
    315 	listhd = &vnode_free_list;
    316 try_nextlist:
    317 	TAILQ_FOREACH(vp, listhd, v_freelist) {
    318 		/*
    319 		 * It's safe to test v_usecount and v_iflag
    320 		 * without holding the interlock here: vnodes
    321 		 * that are in use or already cleaned never
    322 		 * appear on the free lists.
    323 		 */
    324 		KASSERT(vp->v_usecount == 0);
    325 		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
    326 		KASSERT(vp->v_freelisthd == listhd);
    327 
    328 		if (!mutex_tryenter(vp->v_interlock))
    329 			continue;
    330 		if ((vp->v_iflag & VI_XLOCK) != 0) {
    331 			mutex_exit(vp->v_interlock);
    332 			continue;
    333 		}
    334 		mp = vp->v_mount;
    335 		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
    336 			mutex_exit(vp->v_interlock);
    337 			continue;
    338 		}
    339 		break;
    340 	}
    341 
    342 	if (vp == NULL) {
    343 		if (listhd == &vnode_free_list) {
    344 			listhd = &vnode_hold_list;
    345 			goto try_nextlist;
    346 		}
    347 		mutex_exit(&vnode_free_list_lock);
    348 		return EBUSY;
    349 	}
    350 
    351 	/* Remove it from the freelist. */
    352 	TAILQ_REMOVE(listhd, vp, v_freelist);
    353 	vp->v_freelisthd = NULL;
    354 	mutex_exit(&vnode_free_list_lock);
    355 
    356 	KASSERT(vp->v_usecount == 0);
    357 
    358 	/*
    359 	 * The vnode is still associated with a file system, so we must
    360 	 * clean it out before freeing it.  We need to add a reference
    361 	 * before doing this.
    362 	 */
    363 	vp->v_usecount = 1;
    364 	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
    365 	vp->v_iflag |= VI_CHANGING;
    366 	vclean(vp);
    367 	vrelel(vp, VRELEL_CHANGING_SET);
    368 	fstrans_done(mp);
    369 
    370 	return 0;
    371 }
    372 
    373 /*
    374  * Helper thread to keep the number of vnodes below desiredvnodes.
    375  */
    376 static void
    377 vdrain_thread(void *cookie)
    378 {
    379 	int error;
    380 
    381 	mutex_enter(&vnode_free_list_lock);
    382 
    383 	for (;;) {
    384 		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
    385 		while (numvnodes > desiredvnodes) {
    386 			error = cleanvnode();
    387 			if (error)
    388 				kpause("vndsbusy", false, hz, NULL);
    389 			mutex_enter(&vnode_free_list_lock);
    390 			if (error)
    391 				break;
    392 		}
    393 	}
    394 }
    395 
    396 /*
    397  * Remove a vnode from its freelist.
    398  */
    399 void
    400 vremfree(vnode_t *vp)
    401 {
    402 
    403 	KASSERT(mutex_owned(vp->v_interlock));
    404 	KASSERT(vp->v_usecount == 0);
    405 
    406 	/*
    407 	 * Note that the reference count must not change until
    408 	 * the vnode is removed.
    409 	 */
    410 	mutex_enter(&vnode_free_list_lock);
    411 	if (vp->v_holdcnt > 0) {
    412 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
    413 	} else {
    414 		KASSERT(vp->v_freelisthd == &vnode_free_list);
    415 	}
    416 	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
    417 	vp->v_freelisthd = NULL;
    418 	mutex_exit(&vnode_free_list_lock);
    419 }
    420 
    421 /*
    422  * vget: get a particular vnode from the free list and increment its
    423  * reference count.
    424  *
    425  * => Should be called with v_interlock held.
    426  *
    427  * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
    428  * In that case, we cannot grab the vnode, so the process is awakened when
    429  * the transition is completed, and an error returned to indicate that the
    430  * vnode is no longer usable.
    431  */
    432 int
    433 vget(vnode_t *vp, int flags, bool waitok)
    434 {
    435 	int error = 0;
    436 
    437 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    438 	KASSERT(mutex_owned(vp->v_interlock));
    439 	KASSERT((flags & ~LK_NOWAIT) == 0);
    440 	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));
    441 
    442 	/*
    443 	 * Before adding a reference, we must remove the vnode
    444 	 * from its freelist.
    445 	 */
    446 	if (vp->v_usecount == 0) {
    447 		vremfree(vp);
    448 		vp->v_usecount = 1;
    449 	} else {
    450 		atomic_inc_uint(&vp->v_usecount);
    451 	}
    452 
    453 	/*
    454 	 * If the vnode is in the process of changing state we wait
    455 	 * for the change to complete and take care not to return
    456 	 * a clean vnode.
    457 	 */
    458 	if ((vp->v_iflag & VI_CHANGING) != 0) {
    459 		if ((flags & LK_NOWAIT) != 0) {
    460 			vrelel(vp, 0);
    461 			return EBUSY;
    462 		}
    463 		vwait(vp, VI_CHANGING);
    464 		if ((vp->v_iflag & VI_CLEAN) != 0) {
    465 			vrelel(vp, 0);
    466 			return ENOENT;
    467 		}
    468 	}
    469 
    470 	/*
    471 	 * Ok, we got it in good shape.
    472 	 */
    473 	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
    474 	mutex_exit(vp->v_interlock);
    475 	return error;
    476 }
    477 
    478 /*
    479  * vput: unlock and release the reference.
    480  */
    481 void
    482 vput(vnode_t *vp)
    483 {
    484 
    485 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    486 
    487 	VOP_UNLOCK(vp);
    488 	vrele(vp);
    489 }
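        /*
         * Typical pattern (sketch; VOP_SOMETHING is a hypothetical stand-in
         * for a real vnode operation): take the vnode lock, perform the
         * operation, then drop both the lock and the reference with vput().
         *
         *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
         *	error = VOP_SOMETHING(vp, ...);
         *	vput(vp);	// VOP_UNLOCK() + vrele()
         */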
    490 
    491 /*
    492  * Try to drop a reference on a vnode.  Abort if we are releasing the
    493  * last reference.  Note: this _must_ succeed if not the last reference.
    494  */
    495 static inline bool
    496 vtryrele(vnode_t *vp)
    497 {
    498 	u_int use, next;
    499 
    500 	for (use = vp->v_usecount;; use = next) {
    501 		if (use == 1) {
    502 			return false;
    503 		}
    504 		KASSERT(use > 1);
    505 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
    506 		if (__predict_true(next == use)) {
    507 			return true;
    508 		}
    509 	}
    510 }
    511 
    512 /*
    513  * Vnode release.  If the reference count drops to zero, call the inactive
    514  * routine and either return the vnode to the freelist or free it to the pool.
    515  */
    516 static void
    517 vrelel(vnode_t *vp, int flags)
    518 {
    519 	bool recycle, defer;
    520 	int error;
    521 
    522 	KASSERT(mutex_owned(vp->v_interlock));
    523 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    524 	KASSERT(vp->v_freelisthd == NULL);
    525 
    526 	if (__predict_false(vp->v_op == dead_vnodeop_p &&
    527 	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
    528 		vnpanic(vp, "dead but not clean");
    529 	}
    530 
    531 	/*
    532 	 * If not the last reference, just drop the reference count
    533 	 * and unlock.
    534 	 */
    535 	if (vtryrele(vp)) {
    536 		if ((flags & VRELEL_CHANGING_SET) != 0) {
    537 			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
    538 			vp->v_iflag &= ~VI_CHANGING;
    539 			cv_broadcast(&vp->v_cv);
    540 		}
    541 		mutex_exit(vp->v_interlock);
    542 		return;
    543 	}
    544 	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
    545 		vnpanic(vp, "%s: bad ref count", __func__);
    546 	}
    547 
    548 	KASSERT((vp->v_iflag & VI_XLOCK) == 0);
    549 
    550 #ifdef DIAGNOSTIC
    551 	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
    552 	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
    553 		vprint("vrelel: missing VOP_CLOSE()", vp);
    554 	}
    555 #endif
    556 
    557 	/*
    558 	 * If not clean, deactivate the vnode, but preserve
    559 	 * our reference across the call to VOP_INACTIVE().
    560 	 */
    561 	if ((vp->v_iflag & VI_CLEAN) == 0) {
    562 		recycle = false;
    563 
    564 		/*
    565 		 * XXX This ugly block can be largely eliminated if
    566 		 * locking is pushed down into the file systems.
    567 		 *
    568 		 * Defer vnode release to vrele_thread if the caller
    569 		 * requests it explicitly or is the pagedaemon.
    570 		 */
    571 		if ((curlwp == uvm.pagedaemon_lwp) ||
    572 		    (flags & VRELEL_ASYNC_RELE) != 0) {
    573 			defer = true;
    574 		} else if (curlwp == vrele_lwp) {
    575 			/*
    576 			 * We have to try harder.
    577 			 */
    578 			mutex_exit(vp->v_interlock);
    579 			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    580 			KASSERT(error == 0);
    581 			mutex_enter(vp->v_interlock);
    582 			defer = false;
    583 		} else {
    584 			/* If we can't acquire the lock, then defer. */
    585 			mutex_exit(vp->v_interlock);
    586 			error = vn_lock(vp,
    587 			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
    588 			defer = (error != 0);
    589 			mutex_enter(vp->v_interlock);
    590 		}
    591 
    592 		KASSERT(mutex_owned(vp->v_interlock));
    593 		KASSERT(! (curlwp == vrele_lwp && defer));
    594 
    595 		if (defer) {
    596 			/*
    597 			 * Defer reclaim to the kthread; it's not safe to
    598 			 * clean it here.  We donate it our last reference.
    599 			 */
    600 			if ((flags & VRELEL_CHANGING_SET) != 0) {
    601 				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
    602 				vp->v_iflag &= ~VI_CHANGING;
    603 				cv_broadcast(&vp->v_cv);
    604 			}
    605 			mutex_enter(&vrele_lock);
    606 			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
    607 			if (++vrele_pending > (desiredvnodes >> 8))
    608 				cv_signal(&vrele_cv);
    609 			mutex_exit(&vrele_lock);
    610 			mutex_exit(vp->v_interlock);
    611 			return;
    612 		}
    613 
    614 		/*
    615 		 * If the node got another reference while we
    616 		 * released the interlock, don't try to inactivate it yet.
    617 		 */
    618 		if (__predict_false(vtryrele(vp))) {
    619 			VOP_UNLOCK(vp);
    620 			if ((flags & VRELEL_CHANGING_SET) != 0) {
    621 				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
    622 				vp->v_iflag &= ~VI_CHANGING;
    623 				cv_broadcast(&vp->v_cv);
    624 			}
    625 			mutex_exit(vp->v_interlock);
    626 			return;
    627 		}
    628 
    629 		if ((flags & VRELEL_CHANGING_SET) == 0) {
    630 			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
    631 			vp->v_iflag |= VI_CHANGING;
    632 		}
    633 		mutex_exit(vp->v_interlock);
    634 
    635 		/*
    636 		 * The vnode can gain another reference while being
    637 		 * deactivated.  If VOP_INACTIVE() indicates that
    638 		 * the described file has been deleted, then recycle
    639 		 * the vnode irrespective of additional references.
    640 		 * Another thread may be waiting to re-use the on-disk
    641 		 * inode.
    642 		 *
    643 		 * Note that VOP_INACTIVE() will drop the vnode lock.
    644 		 */
    645 		VOP_INACTIVE(vp, &recycle);
    646 		mutex_enter(vp->v_interlock);
    647 		if (!recycle) {
    648 			if (vtryrele(vp)) {
    649 				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
    650 				vp->v_iflag &= ~VI_CHANGING;
    651 				cv_broadcast(&vp->v_cv);
    652 				mutex_exit(vp->v_interlock);
    653 				return;
    654 			}
    655 		}
    656 
    657 		/* Take care of space accounting. */
    658 		if (vp->v_iflag & VI_EXECMAP) {
    659 			atomic_add_int(&uvmexp.execpages,
    660 			    -vp->v_uobj.uo_npages);
    661 			atomic_add_int(&uvmexp.filepages,
    662 			    vp->v_uobj.uo_npages);
    663 		}
    664 		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
    665 		vp->v_vflag &= ~VV_MAPPED;
    666 
    667 		/*
    668 		 * Recycle the vnode if the file is now unused (unlinked),
    669 		 * otherwise just free it.
    670 		 */
    671 		if (recycle) {
    672 			vclean(vp);
    673 		}
    674 		KASSERT(vp->v_usecount > 0);
    675 	} else { /* vnode was already clean */
    676 		if ((flags & VRELEL_CHANGING_SET) == 0) {
    677 			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
    678 			vp->v_iflag |= VI_CHANGING;
    679 		}
    680 	}
    681 
    682 	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
    683 		/* Gained another reference while being reclaimed. */
    684 		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
    685 		vp->v_iflag &= ~VI_CHANGING;
    686 		cv_broadcast(&vp->v_cv);
    687 		mutex_exit(vp->v_interlock);
    688 		return;
    689 	}
    690 
    691 	if ((vp->v_iflag & VI_CLEAN) != 0) {
    692 		/*
    693 		 * It's clean so destroy it.  It isn't referenced
    694 		 * anywhere since it has been reclaimed.
    695 		 */
    696 		KASSERT(vp->v_holdcnt == 0);
    697 		KASSERT(vp->v_writecount == 0);
    698 		mutex_exit(vp->v_interlock);
    699 		vfs_insmntque(vp, NULL);
    700 		if (vp->v_type == VBLK || vp->v_type == VCHR) {
    701 			spec_node_destroy(vp);
    702 		}
    703 		vnfree(vp);
    704 	} else {
    705 		/*
    706 		 * Otherwise, put it back onto the freelist.  It
    707 		 * can't be destroyed while still associated with
    708 		 * a file system.
    709 		 */
    710 		mutex_enter(&vnode_free_list_lock);
    711 		if (vp->v_holdcnt > 0) {
    712 			vp->v_freelisthd = &vnode_hold_list;
    713 		} else {
    714 			vp->v_freelisthd = &vnode_free_list;
    715 		}
    716 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
    717 		mutex_exit(&vnode_free_list_lock);
    718 		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
    719 		vp->v_iflag &= ~VI_CHANGING;
    720 		cv_broadcast(&vp->v_cv);
    721 		mutex_exit(vp->v_interlock);
    722 	}
    723 }
    724 
    725 void
    726 vrele(vnode_t *vp)
    727 {
    728 
    729 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    730 
    731 	if (vtryrele(vp)) {
    732 		return;
    733 	}
    734 	mutex_enter(vp->v_interlock);
    735 	vrelel(vp, 0);
    736 }
    737 
    738 /*
    739  * Asynchronous vnode release: the vnode is released in a different context.
    740  */
    741 void
    742 vrele_async(vnode_t *vp)
    743 {
    744 
    745 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    746 
    747 	if (vtryrele(vp)) {
    748 		return;
    749 	}
    750 	mutex_enter(vp->v_interlock);
    751 	vrelel(vp, VRELEL_ASYNC_RELE);
    752 }
    753 
    754 static void
    755 vrele_thread(void *cookie)
    756 {
    757 	vnodelst_t skip_list;
    758 	vnode_t *vp;
    759 	struct mount *mp;
    760 
    761 	TAILQ_INIT(&skip_list);
    762 
    763 	mutex_enter(&vrele_lock);
    764 	for (;;) {
    765 		while (TAILQ_EMPTY(&vrele_list)) {
    766 			vrele_gen++;
    767 			cv_broadcast(&vrele_cv);
    768 			cv_timedwait(&vrele_cv, &vrele_lock, hz);
    769 			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
    770 		}
    771 		vp = TAILQ_FIRST(&vrele_list);
    772 		mp = vp->v_mount;
    773 		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
    774 		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
    775 			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
    776 			continue;
    777 		}
    778 		vrele_pending--;
    779 		mutex_exit(&vrele_lock);
    780 
    781 		/*
    782 		 * If not the last reference, then ignore the vnode
    783 		 * and look for more work.
    784 		 */
    785 		mutex_enter(vp->v_interlock);
    786 		vrelel(vp, 0);
    787 		fstrans_done(mp);
    788 		mutex_enter(&vrele_lock);
    789 	}
    790 }
    791 
    792 void
    793 vrele_flush(void)
    794 {
    795 	int gen;
    796 
    797 	mutex_enter(&vrele_lock);
    798 	gen = vrele_gen;
    799 	while (vrele_pending && gen == vrele_gen) {
    800 		cv_broadcast(&vrele_cv);
    801 		cv_wait(&vrele_cv, &vrele_lock);
    802 	}
    803 	mutex_exit(&vrele_lock);
    804 }
    805 
    806 /*
    807  * Vnode reference, where a reference is already held by some other
    808  * object (for example, a file structure).
    809  */
    810 void
    811 vref(vnode_t *vp)
    812 {
    813 
    814 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    815 	KASSERT(vp->v_usecount != 0);
    816 
    817 	atomic_inc_uint(&vp->v_usecount);
    818 }
    819 
    820 /*
    821  * Page or buffer structure gets a reference.
    822  * Called with v_interlock held.
    823  */
    824 void
    825 vholdl(vnode_t *vp)
    826 {
    827 
    828 	KASSERT(mutex_owned(vp->v_interlock));
    829 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    830 
    831 	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
    832 		mutex_enter(&vnode_free_list_lock);
    833 		KASSERT(vp->v_freelisthd == &vnode_free_list);
    834 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
    835 		vp->v_freelisthd = &vnode_hold_list;
    836 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
    837 		mutex_exit(&vnode_free_list_lock);
    838 	}
    839 }
    840 
    841 /*
    842  * Page or buffer structure frees a reference.
    843  * Called with v_interlock held.
    844  */
    845 void
    846 holdrelel(vnode_t *vp)
    847 {
    848 
    849 	KASSERT(mutex_owned(vp->v_interlock));
    850 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    851 
    852 	if (vp->v_holdcnt <= 0) {
    853 		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
    854 	}
    855 
    856 	vp->v_holdcnt--;
    857 	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
    858 		mutex_enter(&vnode_free_list_lock);
    859 		KASSERT(vp->v_freelisthd == &vnode_hold_list);
    860 		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
    861 		vp->v_freelisthd = &vnode_free_list;
    862 		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
    863 		mutex_exit(&vnode_free_list_lock);
    864 	}
    865 }
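        /*
         * Sketch of a hypothetical caller: code that attaches a buffer or
         * page to the vnode takes a hold reference under the interlock, and
         * drops it the same way when done.
         *
         *	mutex_enter(vp->v_interlock);
         *	vholdl(vp);	// an unreferenced vnode moves to vnode_hold_list
         *	mutex_exit(vp->v_interlock);
         *	...
         *	mutex_enter(vp->v_interlock);
         *	holdrelel(vp);	// may move it back to vnode_free_list
         *	mutex_exit(vp->v_interlock);
         */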
    866 
    867 /*
    868  * Disassociate the underlying file system from a vnode.
    869  *
    870  * Must be called with the interlock held, and will return with it held.
    871  */
    872 static void
    873 vclean(vnode_t *vp)
    874 {
    875 	lwp_t *l = curlwp;
    876 	bool recycle, active;
    877 	int error;
    878 
    879 	KASSERT(mutex_owned(vp->v_interlock));
    880 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    881 	KASSERT(vp->v_usecount != 0);
    882 
    883 	/* If already clean, nothing to do. */
    884 	if ((vp->v_iflag & VI_CLEAN) != 0) {
    885 		return;
    886 	}
    887 
    888 	active = (vp->v_usecount > 1);
    889 	mutex_exit(vp->v_interlock);
    890 
    891 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    892 
    893 	/*
    894 	 * Prevent the vnode from being recycled or brought into use
    895 	 * while we clean it out.
    896 	 */
    897 	mutex_enter(vp->v_interlock);
    898 	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
    899 	vp->v_iflag |= VI_XLOCK;
    900 	if (vp->v_iflag & VI_EXECMAP) {
    901 		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
    902 		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
    903 	}
    904 	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
    905 	mutex_exit(vp->v_interlock);
    906 
    907 	/*
    908 	 * Clean out any cached data associated with the vnode.
    909 	 * If purging an active vnode, it must be closed and
    910 	 * deactivated before being reclaimed.  Note that
    911 	 * VOP_INACTIVE() will unlock the vnode.
    912 	 */
    913 	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
    914 	if (error != 0) {
    915 		if (wapbl_vphaswapbl(vp))
    916 			WAPBL_DISCARD(wapbl_vptomp(vp));
    917 		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
    918 	}
    919 	KASSERT(error == 0);
    920 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
    921 	if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
    922 		 spec_node_revoke(vp);
    923 	}
    924 	if (active) {
    925 		VOP_INACTIVE(vp, &recycle);
    926 	} else {
    927 		/*
    928 		 * Any other processes trying to obtain this lock must first
    929 		 * wait for VI_XLOCK to clear, then call the new lock operation.
    930 		 */
    931 		VOP_UNLOCK(vp);
    932 	}
    933 
    934 	/* Disassociate the underlying file system from the vnode. */
    935 	if (VOP_RECLAIM(vp)) {
    936 		vnpanic(vp, "%s: cannot reclaim", __func__);
    937 	}
    938 
    939 	KASSERT(vp->v_data == NULL);
    940 	KASSERT(vp->v_uobj.uo_npages == 0);
    941 
    942 	if (vp->v_type == VREG && vp->v_ractx != NULL) {
    943 		uvm_ra_freectx(vp->v_ractx);
    944 		vp->v_ractx = NULL;
    945 	}
    946 
    947 	/* Purge name cache. */
    948 	cache_purge(vp);
    949 
    950 	/* Move to dead mount. */
    951 	vp->v_vflag &= ~VV_ROOT;
    952 	atomic_inc_uint(&dead_rootmount->mnt_refcnt);
    953 	vfs_insmntque(vp, dead_rootmount);
    954 
    955 	/* Done with purge, notify sleepers of the grim news. */
    956 	mutex_enter(vp->v_interlock);
    957 	vp->v_op = dead_vnodeop_p;
    958 	vp->v_vflag |= VV_LOCKSWORK;
    959 	vp->v_iflag |= VI_CLEAN;
    960 	vp->v_tag = VT_NON;
    961 	KNOTE(&vp->v_klist, NOTE_REVOKE);
    962 	vp->v_iflag &= ~VI_XLOCK;
    963 	cv_broadcast(&vp->v_cv);
    964 
    965 	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
    966 }
    967 
    968 /*
    969  * Recycle an unused vnode if caller holds the last reference.
    970  */
    971 bool
    972 vrecycle(vnode_t *vp)
    973 {
    974 
    975 	mutex_enter(vp->v_interlock);
    976 
    977 	KASSERT((vp->v_iflag & VI_MARKER) == 0);
    978 
    979 	if (vp->v_usecount != 1) {
    980 		mutex_exit(vp->v_interlock);
    981 		return false;
    982 	}
    983 	if ((vp->v_iflag & VI_CHANGING) != 0)
    984 		vwait(vp, VI_CHANGING);
    985 	if (vp->v_usecount != 1) {
    986 		mutex_exit(vp->v_interlock);
    987 		return false;
    988 	} else if ((vp->v_iflag & VI_CLEAN) != 0) {
    989 		mutex_exit(vp->v_interlock);
    990 		return true;
    991 	}
    992 	vp->v_iflag |= VI_CHANGING;
    993 	vclean(vp);
    994 	vrelel(vp, VRELEL_CHANGING_SET);
    995 	return true;
    996 }
    997 
    998 /*
    999  * Eliminate all activity associated with the requested vnode
   1000  * and with all vnodes aliased to the requested vnode.
   1001  */
   1002 void
   1003 vrevoke(vnode_t *vp)
   1004 {
   1005 	vnode_t *vq;
   1006 	enum vtype type;
   1007 	dev_t dev;
   1008 
   1009 	KASSERT(vp->v_usecount > 0);
   1010 
   1011 	mutex_enter(vp->v_interlock);
   1012 	if ((vp->v_iflag & VI_CLEAN) != 0) {
   1013 		mutex_exit(vp->v_interlock);
   1014 		return;
   1015 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
   1016 		atomic_inc_uint(&vp->v_usecount);
   1017 		mutex_exit(vp->v_interlock);
   1018 		vgone(vp);
   1019 		return;
   1020 	} else {
   1021 		dev = vp->v_rdev;
   1022 		type = vp->v_type;
   1023 		mutex_exit(vp->v_interlock);
   1024 	}
   1025 
   1026 	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
   1027 		vgone(vq);
   1028 	}
   1029 }
   1030 
   1031 /*
   1032  * Eliminate all activity associated with a vnode in preparation for
   1033  * reuse.  Drops a reference from the vnode.
   1034  */
   1035 void
   1036 vgone(vnode_t *vp)
   1037 {
   1038 
   1039 	mutex_enter(vp->v_interlock);
   1040 	if ((vp->v_iflag & VI_CHANGING) != 0)
   1041 		vwait(vp, VI_CHANGING);
   1042 	vp->v_iflag |= VI_CHANGING;
   1043 	vclean(vp);
   1044 	vrelel(vp, VRELEL_CHANGING_SET);
   1045 }
   1046 
   1047 static inline uint32_t
   1048 vcache_hash(const struct vcache_key *key)
   1049 {
   1050 	uint32_t hash = HASH32_BUF_INIT;
   1051 
   1052 	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
   1053 	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
   1054 	return hash;
   1055 }
   1056 
   1057 static void
   1058 vcache_init(void)
   1059 {
   1060 
   1061 	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
   1062 	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
   1063 	KASSERT(vcache.pool != NULL);
   1064 	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
   1065 	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
   1066 	    &vcache.hashmask);
   1067 }
   1068 
   1069 static void
   1070 vcache_reinit(void)
   1071 {
   1072 	int i;
   1073 	uint32_t hash;
   1074 	u_long oldmask, newmask;
   1075 	struct hashhead *oldtab, *newtab;
   1076 	struct vcache_node *node;
   1077 
   1078 	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
   1079 	mutex_enter(&vcache.lock);
   1080 	oldtab = vcache.hashtab;
   1081 	oldmask = vcache.hashmask;
   1082 	vcache.hashtab = newtab;
   1083 	vcache.hashmask = newmask;
   1084 	for (i = 0; i <= oldmask; i++) {
   1085 		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
   1086 			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
   1087 			hash = vcache_hash(&node->vn_key);
   1088 			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
   1089 			    node, vn_hash);
   1090 		}
   1091 	}
   1092 	mutex_exit(&vcache.lock);
   1093 	hashdone(oldtab, HASH_SLIST, oldmask);
   1094 }
   1095 
   1096 static inline struct vcache_node *
   1097 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
   1098 {
   1099 	struct hashhead *hashp;
   1100 	struct vcache_node *node;
   1101 
   1102 	KASSERT(mutex_owned(&vcache.lock));
   1103 
   1104 	hashp = &vcache.hashtab[hash & vcache.hashmask];
   1105 	SLIST_FOREACH(node, hashp, vn_hash) {
   1106 		if (key->vk_mount != node->vn_key.vk_mount)
   1107 			continue;
   1108 		if (key->vk_key_len != node->vn_key.vk_key_len)
   1109 			continue;
   1110 		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
   1111 			continue;
   1112 		return node;
   1113 	}
   1114 	return NULL;
   1115 }
   1116 
   1117 /*
   1118  * Get a vnode / fs node pair by key and return it referenced through vpp.
   1119  */
   1120 int
   1121 vcache_get(struct mount *mp, const void *key, size_t key_len,
   1122     struct vnode **vpp)
   1123 {
   1124 	int error;
   1125 	uint32_t hash;
   1126 	const void *new_key;
   1127 	struct vnode *vp;
   1128 	struct vcache_key vcache_key;
   1129 	struct vcache_node *node, *new_node;
   1130 
   1131 	new_key = NULL;
   1132 	*vpp = NULL;
   1133 
   1134 	vcache_key.vk_mount = mp;
   1135 	vcache_key.vk_key = key;
   1136 	vcache_key.vk_key_len = key_len;
   1137 	hash = vcache_hash(&vcache_key);
   1138 
   1139 again:
   1140 	mutex_enter(&vcache.lock);
   1141 	node = vcache_hash_lookup(&vcache_key, hash);
   1142 
   1143 	/* If found, take a reference or retry. */
   1144 	if (__predict_true(node != NULL && node->vn_vnode != NULL)) {
   1145 		vp = node->vn_vnode;
   1146 		mutex_enter(vp->v_interlock);
   1147 		mutex_exit(&vcache.lock);
   1148 		error = vget(vp, 0, true /* wait */);
   1149 		if (error == ENOENT)
   1150 			goto again;
   1151 		if (error == 0)
   1152 			*vpp = vp;
   1153 		KASSERT((error != 0) == (*vpp == NULL));
   1154 		return error;
   1155 	}
   1156 
   1157 	/* If another thread is loading this node, wait and retry. */
   1158 	if (node != NULL) {
   1159 		KASSERT(node->vn_vnode == NULL);
   1160 		mutex_exit(&vcache.lock);
   1161 		kpause("vcache", false, mstohz(20), NULL);
   1162 		goto again;
   1163 	}
   1164 	mutex_exit(&vcache.lock);
   1165 
   1166 	/* Allocate and initialize a new vcache / vnode pair. */
   1167 	error = vfs_busy(mp, NULL);
   1168 	if (error)
   1169 		return error;
   1170 	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
   1171 	new_node->vn_vnode = NULL;
   1172 	new_node->vn_key = vcache_key;
   1173 	vp = vnalloc(NULL);
   1174 	mutex_enter(&vcache.lock);
   1175 	node = vcache_hash_lookup(&vcache_key, hash);
   1176 	if (node == NULL) {
   1177 		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
   1178 		    new_node, vn_hash);
   1179 		node = new_node;
   1180 	}
   1181 	mutex_exit(&vcache.lock);
   1182 
   1183 	/* If another thread beat us to inserting this node, retry. */
   1184 	if (node != new_node) {
   1185 		pool_cache_put(vcache.pool, new_node);
   1186 		KASSERT(vp->v_usecount == 1);
   1187 		vp->v_usecount = 0;
   1188 		vnfree(vp);
   1189 		vfs_unbusy(mp, false, NULL);
   1190 		goto again;
   1191 	}
   1192 
   1193 	/* Load the fs node.  Exclusive as new_node->vn_vnode is NULL. */
   1194 	vp->v_iflag |= VI_CHANGING;
   1195 	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
   1196 	if (error) {
   1197 		mutex_enter(&vcache.lock);
   1198 		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
   1199 		    new_node, vcache_node, vn_hash);
   1200 		mutex_exit(&vcache.lock);
   1201 		pool_cache_put(vcache.pool, new_node);
   1202 		KASSERT(vp->v_usecount == 1);
   1203 		vp->v_usecount = 0;
   1204 		vnfree(vp);
   1205 		vfs_unbusy(mp, false, NULL);
   1206 		KASSERT(*vpp == NULL);
   1207 		return error;
   1208 	}
   1209 	KASSERT(new_key != NULL);
   1210 	KASSERT(memcmp(key, new_key, key_len) == 0);
   1211 	KASSERT(vp->v_op != NULL);
   1212 	vfs_insmntque(vp, mp);
   1213 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
   1214 		vp->v_vflag |= VV_MPSAFE;
   1215 	vfs_unbusy(mp, true, NULL);
   1216 
   1217 	/* Finished loading, finalize node. */
   1218 	mutex_enter(&vcache.lock);
   1219 	new_node->vn_key.vk_key = new_key;
   1220 	new_node->vn_vnode = vp;
   1221 	mutex_exit(&vcache.lock);
   1222 	mutex_enter(vp->v_interlock);
   1223 	vp->v_iflag &= ~VI_CHANGING;
   1224 	cv_broadcast(&vp->v_cv);
   1225 	mutex_exit(vp->v_interlock);
   1226 	*vpp = vp;
   1227 	return 0;
   1228 }
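        /*
         * Illustrative sketch (not part of this file): a file system's
         * get-vnode-by-inode-number routine built on vcache_get(), keyed on
         * the inode number.  "example_vget" is hypothetical; VFS_LOADVNODE()
         * is what initializes the fs node on a cache miss.
         *
         *	int
         *	example_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
         *	{
         *
         *		return vcache_get(mp, &ino, sizeof(ino), vpp);
         *	}
         */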
   1229 
   1230 /*
   1231  * Create a new vnode / fs node pair and return it referenced through vpp.
   1232  */
   1233 int
   1234 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
   1235     kauth_cred_t cred, struct vnode **vpp)
   1236 {
   1237 	int error;
   1238 	uint32_t hash;
   1239 	struct vnode *vp;
   1240 	struct vcache_node *new_node;
   1241 	struct vcache_node *old_node __diagused;
   1242 
   1243 	*vpp = NULL;
   1244 
   1245 	/* Allocate and initialize a new vcache / vnode pair. */
   1246 	error = vfs_busy(mp, NULL);
   1247 	if (error)
   1248 		return error;
   1249 	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
   1250 	new_node->vn_key.vk_mount = mp;
   1251 	new_node->vn_vnode = NULL;
   1252 	vp = vnalloc(NULL);
   1253 
   1254 	/* Create and load the fs node. */
   1255 	vp->v_iflag |= VI_CHANGING;
   1256 	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
   1257 	    &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
   1258 	if (error) {
   1259 		pool_cache_put(vcache.pool, new_node);
   1260 		KASSERT(vp->v_usecount == 1);
   1261 		vp->v_usecount = 0;
   1262 		vnfree(vp);
   1263 		vfs_unbusy(mp, false, NULL);
   1264 		KASSERT(*vpp == NULL);
   1265 		return error;
   1266 	}
   1267 	KASSERT(new_node->vn_key.vk_key != NULL);
   1268 	KASSERT(vp->v_op != NULL);
   1269 	hash = vcache_hash(&new_node->vn_key);
   1270 
   1271 	/* Wait for previous instance to be reclaimed, then insert new node. */
   1272 	mutex_enter(&vcache.lock);
   1273 	while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
   1274 #ifdef DIAGNOSTIC
   1275 		if (old_node->vn_vnode != NULL)
   1276 			mutex_enter(old_node->vn_vnode->v_interlock);
   1277 		KASSERT(old_node->vn_vnode == NULL ||
   1278 		    (old_node->vn_vnode->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0);
   1279 		if (old_node->vn_vnode != NULL)
   1280 			mutex_exit(old_node->vn_vnode->v_interlock);
   1281 #endif
   1282 		mutex_exit(&vcache.lock);
   1283 		kpause("vcache", false, mstohz(20), NULL);
   1284 		mutex_enter(&vcache.lock);
   1285 	}
   1286 	SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
   1287 	    new_node, vn_hash);
   1288 	mutex_exit(&vcache.lock);
   1289 	vfs_insmntque(vp, mp);
   1290 	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
   1291 		vp->v_vflag |= VV_MPSAFE;
   1292 	vfs_unbusy(mp, true, NULL);
   1293 
   1294 	/* Finished loading, finalize node. */
   1295 	mutex_enter(&vcache.lock);
   1296 	new_node->vn_vnode = vp;
   1297 	mutex_exit(&vcache.lock);
   1298 	mutex_enter(vp->v_interlock);
   1299 	vp->v_iflag &= ~VI_CHANGING;
   1300 	cv_broadcast(&vp->v_cv);
   1301 	mutex_exit(vp->v_interlock);
   1302 	*vpp = vp;
   1303 	return 0;
   1304 }
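        /*
         * Sketch (hypothetical create path): a file system's VOP_CREATE()
         * implementation would allocate the new vnode / fs node pair with
         * vcache_new(), which in turn calls VFS_NEWVNODE() to create and
         * load the fs node.
         *
         *	error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, vpp);
         */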
   1305 
   1306 /*
   1307  * Prepare key change: lock old and new cache node.
   1308  * Return an error if the new node already exists.
   1309  */
   1310 int
   1311 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
   1312     const void *old_key, size_t old_key_len,
   1313     const void *new_key, size_t new_key_len)
   1314 {
   1315 	uint32_t old_hash, new_hash;
   1316 	struct vcache_key old_vcache_key, new_vcache_key;
   1317 	struct vcache_node *node, *new_node;
   1318 
   1319 	old_vcache_key.vk_mount = mp;
   1320 	old_vcache_key.vk_key = old_key;
   1321 	old_vcache_key.vk_key_len = old_key_len;
   1322 	old_hash = vcache_hash(&old_vcache_key);
   1323 
   1324 	new_vcache_key.vk_mount = mp;
   1325 	new_vcache_key.vk_key = new_key;
   1326 	new_vcache_key.vk_key_len = new_key_len;
   1327 	new_hash = vcache_hash(&new_vcache_key);
   1328 
   1329 	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
   1330 	new_node->vn_vnode = NULL;
   1331 	new_node->vn_key = new_vcache_key;
   1332 
   1333 	mutex_enter(&vcache.lock);
   1334 	node = vcache_hash_lookup(&new_vcache_key, new_hash);
   1335 	if (node != NULL) {
   1336 		mutex_exit(&vcache.lock);
   1337 		pool_cache_put(vcache.pool, new_node);
   1338 		return EEXIST;
   1339 	}
   1340 	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
   1341 	    new_node, vn_hash);
   1342 	node = vcache_hash_lookup(&old_vcache_key, old_hash);
   1343 	KASSERT(node != NULL);
   1344 	KASSERT(node->vn_vnode == vp);
   1345 	node->vn_vnode = NULL;
   1346 	node->vn_key = old_vcache_key;
   1347 	mutex_exit(&vcache.lock);
   1348 	return 0;
   1349 }
   1350 
   1351 /*
   1352  * Key change complete: remove old node and unlock new node.
   1353  */
   1354 void
   1355 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
   1356     const void *old_key, size_t old_key_len,
   1357     const void *new_key, size_t new_key_len)
   1358 {
   1359 	uint32_t old_hash, new_hash;
   1360 	struct vcache_key old_vcache_key, new_vcache_key;
   1361 	struct vcache_node *node;
   1362 
   1363 	old_vcache_key.vk_mount = mp;
   1364 	old_vcache_key.vk_key = old_key;
   1365 	old_vcache_key.vk_key_len = old_key_len;
   1366 	old_hash = vcache_hash(&old_vcache_key);
   1367 
   1368 	new_vcache_key.vk_mount = mp;
   1369 	new_vcache_key.vk_key = new_key;
   1370 	new_vcache_key.vk_key_len = new_key_len;
   1371 	new_hash = vcache_hash(&new_vcache_key);
   1372 
   1373 	mutex_enter(&vcache.lock);
   1374 	node = vcache_hash_lookup(&new_vcache_key, new_hash);
   1375 	KASSERT(node != NULL && node->vn_vnode == NULL);
   1376 	KASSERT(node->vn_key.vk_key_len == new_key_len);
   1377 	node->vn_vnode = vp;
   1378 	node->vn_key = new_vcache_key;
   1379 	node = vcache_hash_lookup(&old_vcache_key, old_hash);
   1380 	KASSERT(node != NULL);
   1381 	KASSERT(node->vn_vnode == NULL);
   1382 	SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
   1383 	    node, vcache_node, vn_hash);
   1384 	mutex_exit(&vcache.lock);
   1385 	pool_cache_put(vcache.pool, node);
   1386 }
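        /*
         * Sketch of the rekey protocol (hypothetical file system path that
         * changes a node's key; the keys are inode numbers here): the node
         * is detached from the old key before the identity change and
         * reattached under the new key afterwards.
         *
         *	error = vcache_rekey_enter(mp, vp, &old_ino, sizeof(old_ino),
         *	    &new_ino, sizeof(new_ino));
         *	if (error != 0)
         *		return error;	// EEXIST: the new key is already cached
         *	... switch vp over to its new identity ...
         *	vcache_rekey_exit(mp, vp, &old_ino, sizeof(old_ino),
         *	    &new_ino, sizeof(new_ino));
         */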
   1387 
   1388 /*
   1389  * Remove a vnode / fs node pair from the cache.
   1390  */
   1391 void
   1392 vcache_remove(struct mount *mp, const void *key, size_t key_len)
   1393 {
   1394 	uint32_t hash;
   1395 	struct vcache_key vcache_key;
   1396 	struct vcache_node *node;
   1397 
   1398 	vcache_key.vk_mount = mp;
   1399 	vcache_key.vk_key = key;
   1400 	vcache_key.vk_key_len = key_len;
   1401 	hash = vcache_hash(&vcache_key);
   1402 
   1403 	mutex_enter(&vcache.lock);
   1404 	node = vcache_hash_lookup(&vcache_key, hash);
   1405 	KASSERT(node != NULL);
   1406 	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
   1407 	    node, vcache_node, vn_hash);
   1408 	mutex_exit(&vcache.lock);
   1409 	pool_cache_put(vcache.pool, node);
   1410 }
   1411 
   1412 /*
   1413  * Update outstanding I/O count and do wakeup if requested.
   1414  */
   1415 void
   1416 vwakeup(struct buf *bp)
   1417 {
   1418 	vnode_t *vp;
   1419 
   1420 	if ((vp = bp->b_vp) == NULL)
   1421 		return;
   1422 
   1423 	KASSERT(bp->b_objlock == vp->v_interlock);
   1424 	KASSERT(mutex_owned(bp->b_objlock));
   1425 
   1426 	if (--vp->v_numoutput < 0)
   1427 		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
   1428 	if (vp->v_numoutput == 0)
   1429 		cv_broadcast(&vp->v_cv);
   1430 }
   1431 
   1432 /*
   1433  * Test a vnode for being or becoming dead.  Returns one of:
   1434  * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
   1435  * ENOENT: vnode is dead.
   1436  * 0:      otherwise.
   1437  *
   1438  * Whenever this function returns a non-zero value all future
   1439  * calls will also return a non-zero value.
   1440  */
   1441 int
   1442 vdead_check(struct vnode *vp, int flags)
   1443 {
   1444 
   1445 	KASSERT(mutex_owned(vp->v_interlock));
   1446 	if (ISSET(vp->v_iflag, VI_XLOCK)) {
   1447 		if (ISSET(flags, VDEAD_NOWAIT))
   1448 			return EBUSY;
   1449 		vwait(vp, VI_XLOCK);
   1450 		KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
   1451 	}
   1452 	if (ISSET(vp->v_iflag, VI_CLEAN))
   1453 		return ENOENT;
   1454 	return 0;
   1455 }
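        /*
         * Typical use (sketch): check for a dead or dying vnode before
         * relying on its fs node, with the interlock held as required.
         *
         *	mutex_enter(vp->v_interlock);
         *	error = vdead_check(vp, VDEAD_NOWAIT);
         *	mutex_exit(vp->v_interlock);
         *	if (error != 0)
         *		return error;	// EBUSY or ENOENT: do not use vp
         */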
   1456 
   1457 /*
   1458  * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
   1459  * recycled.
   1460  */
   1461 static void
   1462 vwait(vnode_t *vp, int flags)
   1463 {
   1464 
   1465 	KASSERT(mutex_owned(vp->v_interlock));
   1466 	KASSERT(vp->v_usecount != 0);
   1467 
   1468 	while ((vp->v_iflag & flags) != 0)
   1469 		cv_wait(&vp->v_cv, vp->v_interlock);
   1470 }
   1471 
   1472 int
   1473 vfs_drainvnodes(long target)
   1474 {
   1475 	int error;
   1476 
   1477 	mutex_enter(&vnode_free_list_lock);
   1478 
   1479 	while (numvnodes > target) {
   1480 		error = cleanvnode();
   1481 		if (error != 0)
   1482 			return error;
   1483 		mutex_enter(&vnode_free_list_lock);
   1484 	}
   1485 
   1486 	mutex_exit(&vnode_free_list_lock);
   1487 
   1488 	vcache_reinit();
   1489 
   1490 	return 0;
   1491 }
   1492 
   1493 void
   1494 vnpanic(vnode_t *vp, const char *fmt, ...)
   1495 {
   1496 	va_list ap;
   1497 
   1498 #ifdef DIAGNOSTIC
   1499 	vprint(NULL, vp);
   1500 #endif
   1501 	va_start(ap, fmt);
   1502 	vpanic(fmt, ap);
   1503 	va_end(ap);
   1504 }
   1505