      1 /*	$NetBSD: vfs_cache.c,v 1.126 2020/01/06 11:22:33 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * Copyright (c) 1989, 1993
     31  *	The Regents of the University of California.  All rights reserved.
     32  *
     33  * Redistribution and use in source and binary forms, with or without
     34  * modification, are permitted provided that the following conditions
     35  * are met:
     36  * 1. Redistributions of source code must retain the above copyright
     37  *    notice, this list of conditions and the following disclaimer.
     38  * 2. Redistributions in binary form must reproduce the above copyright
     39  *    notice, this list of conditions and the following disclaimer in the
     40  *    documentation and/or other materials provided with the distribution.
     41  * 3. Neither the name of the University nor the names of its contributors
     42  *    may be used to endorse or promote products derived from this software
     43  *    without specific prior written permission.
     44  *
     45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     55  * SUCH DAMAGE.
     56  *
     57  *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
     58  */
     59 
     60 #include <sys/cdefs.h>
     61 __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126 2020/01/06 11:22:33 ad Exp $");
     62 
     63 #define __NAMECACHE_PRIVATE
     64 #ifdef _KERNEL_OPT
     65 #include "opt_ddb.h"
     66 #include "opt_dtrace.h"
     67 #include "opt_revcache.h"
     68 #endif
     69 
     70 #include <sys/param.h>
     71 #include <sys/atomic.h>
     72 #include <sys/cpu.h>
     73 #include <sys/errno.h>
     74 #include <sys/evcnt.h>
     75 #include <sys/kernel.h>
     76 #include <sys/kthread.h>
     77 #include <sys/mount.h>
     78 #include <sys/mutex.h>
     79 #include <sys/namei.h>
     80 #include <sys/pool.h>
     81 #include <sys/sdt.h>
     82 #include <sys/sysctl.h>
     83 #include <sys/systm.h>
     84 #include <sys/time.h>
     85 #include <sys/vnode_impl.h>
     86 
     87 /*
     88  * Name caching works as follows:
     89  *
     90  * Names found by directory scans are retained in a cache
     91  * for future reference.  It is managed LRU, so frequently
      92  * used names will hang around.  The cache is indexed by a hash value
      93  * obtained from (dvp, name), where dvp refers to the directory
      94  * containing the name.
     95  *
     96  * Upon reaching the last segment of a path, if the reference
     97  * is for DELETE, or NOCACHE is set (rewrite), and the
     98  * name is located in the cache, it will be dropped.
     99  */
    100 
    101 /*
    102  * Cache entry lifetime:
    103  *
    104  *	nonexistent
    105  *	---create---> active
    106  *	---invalidate---> queued
    107  *	---reclaim---> nonexistent.
    108  *
    109  * States:
    110  * - Nonexistent.  Cache entry does not exist.
    111  *
    112  * - Active.  cache_lookup, cache_lookup_raw, cache_revlookup can look
    113  *   up, acquire references, and hand off references to vnodes,
    114  *   e.g. via v_interlock.  Marked by nonnull ncp->nc_dvp.
    115  *
     116  * - Queued.  Pending destruction by cache_reclaim.  Cannot be used by
    117  *   cache_lookup, cache_lookup_raw, or cache_revlookup.  May still be
    118  *   on lists.  Marked by null ncp->nc_dvp.
    119  *
    120  * Transitions:
    121  *
    122  * - Create: nonexistent--->active
    123  *
    124  *   Done by cache_enter(dvp, vp, name, namelen, cnflags), called by
    125  *   VOP_LOOKUP after the answer is found.  Allocates a struct
    126  *   namecache object, initializes it with the above fields, and
    127  *   activates it by inserting it into the forward and reverse tables.
    128  *
    129  * - Invalidate: active--->queued
    130  *
    131  *   Done by cache_invalidate.  If not already invalidated, nullify
    132  *   ncp->nc_dvp and ncp->nc_vp, and add to cache_gcqueue.  Called,
    133  *   among various other places, in cache_lookup(dvp, name, namelen,
    134  *   nameiop, cnflags, &iswht, &vp) when MAKEENTRY is missing from
    135  *   cnflags.
    136  *
    137  * - Reclaim: queued--->nonexistent
    138  *
    139  *   Done by cache_reclaim.  Disassociate ncp from any lists it is on
    140  *   and free memory.
    141  */
    142 
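/*
 * Illustrative only: a rough sketch of the calls that typically drive a
 * single entry through the states above (not a literal trace; locking
 * and error handling are elided, "foo" is an arbitrary name):
 *
 *	VOP_LOOKUP() finds "foo"   -> cache_enter(dvp, vp, "foo", 3, cnflags)
 *						nonexistent ---> active
 *	"foo" is later deleted     -> cache_lookup() without MAKEENTRY
 *				      -> cache_invalidate(ncp)
 *						active ---> queued
 *	cache_thread(), once/sec   -> cache_reclaim() frees the entry
 *						queued ---> nonexistent
 */
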
    143 /*
    144  * Locking.
    145  *
    146  * L namecache_lock		Global lock for namecache table and queues.
    147  * C struct nchcpu::cpu_lock	Per-CPU lock to reduce read contention.
    148  * N struct namecache::nc_lock	Per-entry lock.
    149  * V struct vnode::v_interlock	Vnode interlock.
    150  *
    151  * Lock order: L -> C -> N -> V
    152  *
    153  *	Examples:
    154  *	. L->C: cache_reclaim
    155  *	. C->N->V: cache_lookup
    156  *	. L->N->V: cache_purge1, cache_revlookup
    157  *
    158  * All use serialized by namecache_lock:
    159  *
    160  *	nclruhead / struct namecache::nc_lru
    161  *	struct vnode_impl::vi_dnclist / struct namecache::nc_dvlist
    162  *	struct vnode_impl::vi_nclist / struct namecache::nc_vlist
    163  *	nchstats
    164  *
    165  * - Insertion serialized by namecache_lock,
    166  * - read protected by per-CPU lock,
    167  * - insert/read ordering guaranteed by memory barriers, and
    168  * - deletion allowed only under namecache_lock and *all* per-CPU locks
    169  *   in CPU_INFO_FOREACH order:
    170  *
    171  *	nchashtbl / struct namecache::nc_hash
    172  *
    173  *   The per-CPU locks exist only to reduce the probability of
    174  *   contention between readers.  We do not bind to a CPU, so
    175  *   contention is still possible.
    176  *
    177  * All use serialized by struct namecache::nc_lock:
    178  *
    179  *	struct namecache::nc_dvp
    180  *	struct namecache::nc_vp
    181  *	struct namecache::nc_gcqueue (*)
    182  *	struct namecache::nc_hittime (**)
    183  *
     184  * (*) Once on the queue, only cache_thread uses nc_gcqueue, unlocked.
     185  * (**) cache_prune reads nc_hittime unlocked, since an approximate value is OK.
    186  *
    187  * Unlocked because stable after initialization:
    188  *
    189  *	struct namecache::nc_dvp
    190  *	struct namecache::nc_vp
    191  *	struct namecache::nc_flags
    192  *	struct namecache::nc_nlen
    193  *	struct namecache::nc_name
    194  *
    195  * Unlocked because approximation is OK:
    196  *
    197  *	struct nchcpu::cpu_stats
    198  *	struct nchcpu::cpu_stats_last
    199  *
    200  * Updates under namecache_lock or any per-CPU lock are marked with
    201  * COUNT, while updates outside those locks are marked with COUNT_UNL.
    202  *
    203  * - The theory seems to have been that you could replace COUNT_UNL by
    204  *   atomic operations -- except that doesn't help unless you also
    205  *   replace COUNT by atomic operations, because mixing atomics and
    206  *   nonatomics is a recipe for failure.
    207  * - We use 32-bit per-CPU counters and 64-bit global counters under
    208  *   the theory that 32-bit counters are less likely to be hosed by
    209  *   nonatomic increment.
    210  */
    211 
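/*
 * Illustrative only: the C -> N -> V leg of the lock order as
 * cache_lookup() below actually takes it (error handling elided):
 *
 *	cpup = curcpu()->ci_data.cpu_nch;
 *	mutex_enter(&cpup->cpu_lock);			C acquired
 *	ncp = cache_lookup_entry(dvp, name, namelen);	returns with N held
 *	vp = ncp->nc_vp;
 *	mutex_enter(vp->v_interlock);			V acquired
 *	mutex_exit(&ncp->nc_lock);			N released
 *	mutex_exit(&cpup->cpu_lock);			C released
 *	error = vcache_tryvget(vp);			only V held here
 */
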
    212 /*
    213  * The comment below is preserved for posterity in case it is
    214  * important, but it is clear that everywhere the namecache_count_*()
    215  * functions are called, other cache_*() functions that take the same
    216  * locks are also called, so I can't imagine how this could be a
    217  * problem:
    218  *
    219  * N.B.: Attempting to protect COUNT_UNL() increments by taking
    220  * a per-cpu lock in the namecache_count_*() functions causes
    221  * a deadlock.  Don't do that, use atomic increments instead if
    222  * the imperfections here bug you.
    223  */
    224 
    225 /*
    226  * struct nchstats_percpu:
    227  *
    228  *	Per-CPU counters.
    229  */
    230 struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
    231 
    232 /*
    233  * struct nchcpu:
    234  *
    235  *	Per-CPU namecache state: lock and per-CPU counters.
    236  */
    237 struct nchcpu {
    238 	kmutex_t		cpu_lock;
    239 	struct nchstats_percpu	cpu_stats;
    240 	/* XXX maybe __cacheline_aligned would improve this? */
    241 	struct nchstats_percpu	cpu_stats_last;	/* from last sample */
    242 };
    243 
    244 /*
    245  * The type for the hash code. While the hash function generates a
    246  * u32, the hash code has historically been passed around as a u_long,
    247  * and the value is modified by xor'ing a uintptr_t, so it's not
    248  * entirely clear what the best type is. For now I'll leave it
    249  * unchanged as u_long.
    250  */
    251 
    252 typedef u_long nchash_t;
    253 
    254 /*
     255  * Structures associated with name caching.
    256  */
    257 
    258 static kmutex_t *namecache_lock __read_mostly;
    259 static pool_cache_t namecache_cache __read_mostly;
    260 static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;
    261 
    262 static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly;
    263 static u_long	nchash __read_mostly;
    264 
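/*
 * Chain selection example (illustrative): NCHASH2() mixes the name hash
 * with the directory vnode pointer, shifted right to discard low-order
 * bits that are constant due to allocation alignment, and masks the
 * result with nchash (a power-of-two minus one, from hashinit()).
 */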
    265 #define	NCHASH2(hash, dvp)	\
    266 	(((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash)
    267 
    268 /* Number of cache entries allocated. */
    269 static long	numcache __cacheline_aligned;
    270 
    271 /* Garbage collection queue and number of entries pending in it. */
    272 static void	*cache_gcqueue;
    273 static u_int	cache_gcpend;
    274 
     275 /* Cache effectiveness statistics.  Totals harvested from the per-CPU stats. */
    276 struct nchstats	nchstats __cacheline_aligned;
    277 
    278 /*
    279  * Macros to count an event, update the central stats with per-cpu
    280  * values and add current per-cpu increments to the subsystem total
    281  * last collected by cache_reclaim().
    282  */
    283 #define	CACHE_STATS_CURRENT	/* nothing */
    284 
    285 #define	COUNT(cpup, f)	((cpup)->cpu_stats.f++)
    286 
    287 #define	UPDATE(cpup, f) do { \
    288 	struct nchcpu *Xcpup = (cpup); \
    289 	uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \
    290 	nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \
    291 	Xcpup->cpu_stats_last.f = Xcnt; \
    292 } while (/* CONSTCOND */ 0)
    293 
    294 #define	ADD(stats, cpup, f) do { \
    295 	struct nchcpu *Xcpup = (cpup); \
    296 	stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \
    297 } while (/* CONSTCOND */ 0)
    298 
     299 /* Do unlocked stats the same way.  Use a different name to allow changing the approach later. */
    300 #define	COUNT_UNL(cpup, f)	COUNT((cpup), f)
    301 
    302 static const int cache_lowat = 95;
    303 static const int cache_hiwat = 98;
    304 static const int cache_hottime = 5;	/* number of seconds */
    305 static int doingcache = 1;		/* 1 => enable the cache */
    306 
    307 static struct evcnt cache_ev_scan;
    308 static struct evcnt cache_ev_gc;
    309 static struct evcnt cache_ev_over;
    310 static struct evcnt cache_ev_under;
    311 static struct evcnt cache_ev_forced;
    312 
    313 static struct namecache *cache_lookup_entry(
    314     const struct vnode *, const char *, size_t);
    315 static void cache_thread(void *);
    316 static void cache_invalidate(struct namecache *);
    317 static void cache_disassociate(struct namecache *);
    318 static void cache_reclaim(void);
    319 static int cache_ctor(void *, void *, int);
    320 static void cache_dtor(void *, void *);
    321 
    322 static struct sysctllog *sysctllog;
    323 static void sysctl_cache_stat_setup(void);
    324 
    325 SDT_PROVIDER_DEFINE(vfs);
    326 
    327 SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
    328 SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
    329 SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
    330 SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
    331 SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
    332 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
    333     "char *", "size_t");
    334 SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
    335     "char *", "size_t");
    336 SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
    337     "char *", "size_t");
    338 SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
    339      "struct vnode *");
    340 SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
    341      "int");
    342 SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
    343 SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
    344     "char *", "size_t");
    345 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
    346     "char *", "size_t");
    347 
    348 /*
    349  * Compute the hash for an entry.
    350  *
    351  * (This is for now a wrapper around namei_hash, whose interface is
    352  * for the time being slightly inconvenient.)
    353  */
    354 static nchash_t
    355 cache_hash(const char *name, size_t namelen)
    356 {
    357 	const char *endptr;
    358 
    359 	endptr = name + namelen;
    360 	return namei_hash(name, &endptr);
    361 }
    362 
    363 /*
    364  * Invalidate a cache entry and enqueue it for garbage collection.
    365  * The caller needs to hold namecache_lock or a per-cpu lock to hold
    366  * off cache_reclaim().
    367  */
    368 static void
    369 cache_invalidate(struct namecache *ncp)
    370 {
    371 	void *head;
    372 
    373 	KASSERT(mutex_owned(&ncp->nc_lock));
    374 
    375 	if (ncp->nc_dvp != NULL) {
    376 		SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp,
    377 		    0, 0, 0, 0);
    378 
    379 		ncp->nc_vp = NULL;
    380 		ncp->nc_dvp = NULL;
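		/*
		 * Push this entry onto the lock-free garbage collection
		 * list: snapshot the head, link to it, and try to swing
		 * the head over with a compare-and-swap, retrying if
		 * another CPU got there first.
		 */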
    381 		do {
    382 			head = cache_gcqueue;
    383 			ncp->nc_gcqueue = head;
    384 		} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
    385 		atomic_inc_uint(&cache_gcpend);
    386 	}
    387 }
    388 
    389 /*
    390  * Disassociate a namecache entry from any vnodes it is attached to,
    391  * and remove from the global LRU list.
    392  */
    393 static void
    394 cache_disassociate(struct namecache *ncp)
    395 {
    396 
    397 	KASSERT(mutex_owned(namecache_lock));
    398 	KASSERT(ncp->nc_dvp == NULL);
    399 
    400 	if (ncp->nc_lru.tqe_prev != NULL) {
    401 		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
    402 		ncp->nc_lru.tqe_prev = NULL;
    403 	}
    404 	if (ncp->nc_vlist.le_prev != NULL) {
    405 		LIST_REMOVE(ncp, nc_vlist);
    406 		ncp->nc_vlist.le_prev = NULL;
    407 	}
    408 	if (ncp->nc_dvlist.le_prev != NULL) {
    409 		LIST_REMOVE(ncp, nc_dvlist);
    410 		ncp->nc_dvlist.le_prev = NULL;
    411 	}
    412 }
    413 
    414 /*
    415  * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
    416  * this locks out all "readers".
    417  */
    418 static void
    419 cache_lock_cpus(void)
    420 {
    421 	CPU_INFO_ITERATOR cii;
    422 	struct cpu_info *ci;
    423 	struct nchcpu *cpup;
    424 
    425 	/*
    426 	 * Lock out all CPUs first, then harvest per-cpu stats.  This
    427 	 * is probably not quite as cache-efficient as doing the lock
    428 	 * and harvest at the same time, but allows cache_stat_sysctl()
    429 	 * to make do with a per-cpu lock.
    430 	 */
    431 	for (CPU_INFO_FOREACH(cii, ci)) {
    432 		cpup = ci->ci_data.cpu_nch;
    433 		mutex_enter(&cpup->cpu_lock);
    434 	}
    435 	for (CPU_INFO_FOREACH(cii, ci)) {
    436 		cpup = ci->ci_data.cpu_nch;
    437 		UPDATE(cpup, ncs_goodhits);
    438 		UPDATE(cpup, ncs_neghits);
    439 		UPDATE(cpup, ncs_badhits);
    440 		UPDATE(cpup, ncs_falsehits);
    441 		UPDATE(cpup, ncs_miss);
    442 		UPDATE(cpup, ncs_long);
    443 		UPDATE(cpup, ncs_pass2);
    444 		UPDATE(cpup, ncs_2passes);
    445 		UPDATE(cpup, ncs_revhits);
    446 		UPDATE(cpup, ncs_revmiss);
    447 	}
    448 }
    449 
    450 /*
    451  * Release all CPU locks.
    452  */
    453 static void
    454 cache_unlock_cpus(void)
    455 {
    456 	CPU_INFO_ITERATOR cii;
    457 	struct cpu_info *ci;
    458 	struct nchcpu *cpup;
    459 
    460 	for (CPU_INFO_FOREACH(cii, ci)) {
    461 		cpup = ci->ci_data.cpu_nch;
    462 		mutex_exit(&cpup->cpu_lock);
    463 	}
    464 }
    465 
    466 /*
    467  * Find a single cache entry and return it locked.
    468  * The caller needs to hold namecache_lock or a per-cpu lock to hold
    469  * off cache_reclaim().
    470  */
    471 static struct namecache *
    472 cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen)
    473 {
    474 	struct nchashhead *ncpp;
    475 	struct namecache *ncp;
    476 	nchash_t hash;
    477 
    478 	KASSERT(dvp != NULL);
    479 	hash = cache_hash(name, namelen);
    480 	ncpp = &nchashtbl[NCHASH2(hash, dvp)];
    481 
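	/*
	 * The hash chain is walked unlocked: the namecache_lock or
	 * per-CPU lock held by our caller holds off deletion (which
	 * needs all of those locks), and ordering against insertion in
	 * cache_enter() is provided by the paired membar_producer() /
	 * membar_datadep_consumer().
	 */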
    482 	LIST_FOREACH(ncp, ncpp, nc_hash) {
    483 		membar_datadep_consumer();	/* for Alpha... */
    484 		if (ncp->nc_dvp != dvp ||
    485 		    ncp->nc_nlen != namelen ||
    486 		    memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen))
     487 			continue;
     488 		mutex_enter(&ncp->nc_lock);
    489 		if (__predict_true(ncp->nc_dvp == dvp)) {
    490 			ncp->nc_hittime = hardclock_ticks;
    491 			SDT_PROBE(vfs, namecache, lookup, hit, dvp,
    492 			    name, namelen, 0, 0);
    493 			return ncp;
    494 		}
    495 		/* Raced: entry has been nullified. */
    496 		mutex_exit(&ncp->nc_lock);
    497 	}
    498 
    499 	SDT_PROBE(vfs, namecache, lookup, miss, dvp,
    500 	    name, namelen, 0, 0);
    501 	return NULL;
    502 }
    503 
    504 /*
     505  * Look for the name in the cache.  We don't do this
    506  * if the segment name is long, simply so the cache can avoid
    507  * holding long names (which would either waste space, or
    508  * add greatly to the complexity).
    509  *
    510  * Lookup is called with DVP pointing to the directory to search,
    511  * and CNP providing the name of the entry being sought: cn_nameptr
    512  * is the name, cn_namelen is its length, and cn_flags is the flags
    513  * word from the namei operation.
    514  *
    515  * DVP must be locked.
    516  *
    517  * There are three possible non-error return states:
    518  *    1. Nothing was found in the cache. Nothing is known about
    519  *       the requested name.
    520  *    2. A negative entry was found in the cache, meaning that the
    521  *       requested name definitely does not exist.
    522  *    3. A positive entry was found in the cache, meaning that the
    523  *       requested name does exist and that we are providing the
    524  *       vnode.
    525  * In these cases the results are:
    526  *    1. 0 returned; VN is set to NULL.
    527  *    2. 1 returned; VN is set to NULL.
    528  *    3. 1 returned; VN is set to the vnode found.
    529  *
    530  * The additional result argument ISWHT is set to zero, unless a
    531  * negative entry is found that was entered as a whiteout, in which
    532  * case ISWHT is set to one.
    533  *
    534  * The ISWHT_RET argument pointer may be null. In this case an
    535  * assertion is made that the whiteout flag is not set. File systems
    536  * that do not support whiteouts can/should do this.
    537  *
    538  * Filesystems that do support whiteouts should add ISWHITEOUT to
    539  * cnp->cn_flags if ISWHT comes back nonzero.
    540  *
    541  * When a vnode is returned, it is locked, as per the vnode lookup
    542  * locking protocol.
    543  *
    544  * There is no way for this function to fail, in the sense of
    545  * generating an error that requires aborting the namei operation.
    546  *
    547  * (Prior to October 2012, this function returned an integer status,
    548  * and a vnode, and mucked with the flags word in CNP for whiteouts.
    549  * The integer status was -1 for "nothing found", ENOENT for "a
    550  * negative entry found", 0 for "a positive entry found", and possibly
    551  * other errors, and the value of VN might or might not have been set
    552  * depending on what error occurred.)
    553  */
    554 bool
    555 cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
    556 	     uint32_t nameiop, uint32_t cnflags,
    557 	     int *iswht_ret, struct vnode **vn_ret)
    558 {
    559 	struct namecache *ncp;
    560 	struct vnode *vp;
    561 	struct nchcpu *cpup;
    562 	int error;
    563 	bool hit;
    564 
    565 
    566 	/* Establish default result values */
    567 	if (iswht_ret != NULL) {
    568 		*iswht_ret = 0;
    569 	}
    570 	*vn_ret = NULL;
    571 
    572 	if (__predict_false(!doingcache)) {
    573 		return false;
    574 	}
    575 
    576 	cpup = curcpu()->ci_data.cpu_nch;
    577 	mutex_enter(&cpup->cpu_lock);
    578 	if (__predict_false(namelen > USHRT_MAX)) {
    579 		SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
    580 		    name, namelen, 0, 0);
    581 		COUNT(cpup, ncs_long);
    582 		mutex_exit(&cpup->cpu_lock);
    583 		/* found nothing */
    584 		return false;
    585 	}
    586 
    587 	ncp = cache_lookup_entry(dvp, name, namelen);
    588 	if (__predict_false(ncp == NULL)) {
    589 		COUNT(cpup, ncs_miss);
    590 		mutex_exit(&cpup->cpu_lock);
    591 		/* found nothing */
    592 		return false;
    593 	}
    594 	if ((cnflags & MAKEENTRY) == 0) {
    595 		COUNT(cpup, ncs_badhits);
    596 		/*
     597 		 * Last component and we are renaming or deleting,
     598 		 * the cache entry is invalid, or we otherwise don't
     599 		 * want the cache entry to exist.
    600 		 */
    601 		cache_invalidate(ncp);
    602 		mutex_exit(&ncp->nc_lock);
    603 		mutex_exit(&cpup->cpu_lock);
    604 		/* found nothing */
    605 		return false;
    606 	}
    607 	if (ncp->nc_vp == NULL) {
    608 		if (iswht_ret != NULL) {
    609 			/*
    610 			 * Restore the ISWHITEOUT flag saved earlier.
    611 			 */
    612 			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
    613 			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
    614 		} else {
    615 			KASSERT(ncp->nc_flags == 0);
    616 		}
    617 
    618 		if (__predict_true(nameiop != CREATE ||
    619 		    (cnflags & ISLASTCN) == 0)) {
    620 			COUNT(cpup, ncs_neghits);
    621 			/* found neg entry; vn is already null from above */
    622 			hit = true;
    623 		} else {
    624 			COUNT(cpup, ncs_badhits);
    625 			/*
    626 			 * Last component and we are preparing to create
    627 			 * the named object, so flush the negative cache
    628 			 * entry.
    629 			 */
    630 			cache_invalidate(ncp);
    631 			/* found nothing */
    632 			hit = false;
    633 		}
    634 		mutex_exit(&ncp->nc_lock);
    635 		mutex_exit(&cpup->cpu_lock);
    636 		return hit;
    637 	}
    638 
    639 	vp = ncp->nc_vp;
    640 	mutex_enter(vp->v_interlock);
    641 	mutex_exit(&ncp->nc_lock);
    642 	mutex_exit(&cpup->cpu_lock);
    643 
    644 	/*
    645 	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
    646 	 */
    647 	error = vcache_tryvget(vp);
    648 	if (error) {
    649 		KASSERT(error == EBUSY);
    650 		/*
    651 		 * This vnode is being cleaned out.
    652 		 * XXX badhits?
    653 		 */
    654 		COUNT_UNL(cpup, ncs_falsehits);
    655 		/* found nothing */
    656 		return false;
    657 	}
    658 
    659 	COUNT_UNL(cpup, ncs_goodhits);
    660 	/* found it */
    661 	*vn_ret = vp;
    662 	return true;
    663 }
    664 
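/*
 * Illustrative only: a minimal sketch (not taken from any real file
 * system) of how a VOP_LOOKUP implementation typically consumes
 * cache_lookup() and cache_enter().  "myfs_lookup" and "myfs_scan_dir"
 * are hypothetical; locking and most error handling are elided.
 *
 *	int
 *	myfs_lookup(struct vnode *dvp, struct vnode **vpp,
 *	    struct componentname *cnp)
 *	{
 *		int iswht;
 *
 *		if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 *		    cnp->cn_nameiop, cnp->cn_flags, &iswht, vpp)) {
 *			if (*vpp != NULL)
 *				return 0;		// positive hit
 *			if (iswht)			// whiteout support
 *				cnp->cn_flags |= ISWHITEOUT;
 *			return ENOENT;			// negative hit
 *		}
 *
 *		// Nothing known: do the real scan, then prime the cache.
 *		if (myfs_scan_dir(dvp, cnp, vpp) != 0) {
 *			cache_enter(dvp, NULL, cnp->cn_nameptr,
 *			    cnp->cn_namelen, cnp->cn_flags);	// negative
 *			return ENOENT;
 *		}
 *		cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
 *		    cnp->cn_flags);
 *		return 0;
 *	}
 */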
    665 
    666 /*
    667  * Cut-'n-pasted version of the above without the nameiop argument.
    668  */
    669 bool
    670 cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
    671 		 uint32_t cnflags,
    672 		 int *iswht_ret, struct vnode **vn_ret)
    673 {
    674 	struct namecache *ncp;
    675 	struct vnode *vp;
    676 	struct nchcpu *cpup;
    677 	int error;
    678 
    679 	/* Establish default results. */
    680 	if (iswht_ret != NULL) {
    681 		*iswht_ret = 0;
    682 	}
    683 	*vn_ret = NULL;
    684 
    685 	if (__predict_false(!doingcache)) {
    686 		/* found nothing */
    687 		return false;
    688 	}
    689 
    690 	cpup = curcpu()->ci_data.cpu_nch;
    691 	mutex_enter(&cpup->cpu_lock);
    692 	if (__predict_false(namelen > USHRT_MAX)) {
    693 		COUNT(cpup, ncs_long);
    694 		mutex_exit(&cpup->cpu_lock);
    695 		/* found nothing */
    696 		return false;
    697 	}
    698 	ncp = cache_lookup_entry(dvp, name, namelen);
    699 	if (__predict_false(ncp == NULL)) {
    700 		COUNT(cpup, ncs_miss);
    701 		mutex_exit(&cpup->cpu_lock);
    702 		/* found nothing */
    703 		return false;
    704 	}
    705 	vp = ncp->nc_vp;
    706 	if (vp == NULL) {
    707 		/*
    708 		 * Restore the ISWHITEOUT flag saved earlier.
    709 		 */
    710 		if (iswht_ret != NULL) {
    711 			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
    712 			/*cnp->cn_flags |= ncp->nc_flags;*/
    713 			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
    714 		}
    715 		COUNT(cpup, ncs_neghits);
    716 		mutex_exit(&ncp->nc_lock);
    717 		mutex_exit(&cpup->cpu_lock);
    718 		/* found negative entry; vn is already null from above */
    719 		return true;
    720 	}
    721 	mutex_enter(vp->v_interlock);
    722 	mutex_exit(&ncp->nc_lock);
    723 	mutex_exit(&cpup->cpu_lock);
    724 
    725 	/*
    726 	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
    727 	 */
    728 	error = vcache_tryvget(vp);
    729 	if (error) {
    730 		KASSERT(error == EBUSY);
    731 		/*
    732 		 * This vnode is being cleaned out.
    733 		 * XXX badhits?
    734 		 */
    735 		COUNT_UNL(cpup, ncs_falsehits);
    736 		/* found nothing */
    737 		return false;
    738 	}
    739 
    740 	COUNT_UNL(cpup, ncs_goodhits); /* XXX can be "badhits" */
    741 	/* found it */
    742 	*vn_ret = vp;
    743 	return true;
    744 }
    745 
    746 /*
    747  * Scan cache looking for name of directory entry pointing at vp.
    748  *
    749  * If the lookup succeeds the vnode is referenced and stored in dvpp.
    750  *
    751  * If bufp is non-NULL, also place the name in the buffer which starts
    752  * at bufp, immediately before *bpp, and move bpp backwards to point
    753  * at the start of it.  (Yes, this is a little baroque, but it's done
    754  * this way to cater to the whims of getcwd).
    755  *
    756  * Returns 0 on success, -1 on cache miss, positive errno on failure.
    757  */
    758 int
    759 cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
    760 {
    761 	struct namecache *ncp;
    762 	struct vnode *dvp;
    763 	struct nchcpu *cpup;
    764 	char *bp;
    765 	int error, nlen;
    766 
    767 	KASSERT(vp != NULL);
    768 
    769 	if (!doingcache)
    770 		goto out;
    771 
    772 	/*
    773 	 * We increment counters in the local CPU's per-cpu stats.
    774 	 * We don't take the per-cpu lock, however, since this function
     775 	 * is the only place these counters are incremented, so no one
    776 	 * will be racing with us to increment them.
    777 	 */
    778 	cpup = curcpu()->ci_data.cpu_nch;
    779 	mutex_enter(namecache_lock);
    780 	LIST_FOREACH(ncp, &VNODE_TO_VIMPL(vp)->vi_nclist, nc_vlist) {
    781 		mutex_enter(&ncp->nc_lock);
    782 		if (ncp->nc_vp == vp &&
    783 		    (dvp = ncp->nc_dvp) != NULL &&
    784 		    dvp != vp) { 		/* avoid pesky . entries.. */
    785 			if (ncp->nc_nlen == 1 &&
    786 			    ncp->nc_name[0] == '.') {
    787 			    	mutex_exit(&ncp->nc_lock);
    788 			    	continue;
    789 			}
    790 			if (ncp->nc_nlen == 2 &&
    791 			    ncp->nc_name[0] == '.' &&
    792 			    ncp->nc_name[1] == '.') {
    793 			    	mutex_exit(&ncp->nc_lock);
    794 			    	continue;
    795 			}
    796 			COUNT(cpup, ncs_revhits);
    797 			nlen = ncp->nc_nlen;
    798 
    799 			if (bufp) {
    800 				bp = *bpp;
    801 				bp -= nlen;
    802 				if (bp <= bufp) {
    803 					*dvpp = NULL;
    804 					mutex_exit(&ncp->nc_lock);
    805 					mutex_exit(namecache_lock);
    806 					SDT_PROBE(vfs, namecache, revlookup,
    807 					    fail, vp, ERANGE, 0, 0, 0);
    808 					return (ERANGE);
    809 				}
    810 				memcpy(bp, ncp->nc_name, nlen);
    811 				*bpp = bp;
    812 			}
    813 
    814 			mutex_enter(dvp->v_interlock);
    815 			mutex_exit(&ncp->nc_lock);
    816 			mutex_exit(namecache_lock);
    817 			error = vcache_tryvget(dvp);
    818 			if (error) {
    819 				KASSERT(error == EBUSY);
    820 				if (bufp)
    821 					(*bpp) += nlen;
    822 				*dvpp = NULL;
    823 				SDT_PROBE(vfs, namecache, revlookup, fail, vp,
    824 				    error, 0, 0, 0);
    825 				return -1;
    826 			}
    827 			*dvpp = dvp;
    828 			SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
    829 			    0, 0, 0);
    830 			return (0);
    831 		}
    832 		mutex_exit(&ncp->nc_lock);
    833 	}
    834 	COUNT(cpup, ncs_revmiss);
    835 	mutex_exit(namecache_lock);
    836  out:
    837 	*dvpp = NULL;
    838 	return (-1);
    839 }
    840 
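/*
 * Illustrative only: a rough sketch of the backwards-filling convention
 * described above, in the style of its main consumer, getcwd_common()
 * in vfs_getcwd.c.  "start_vp" is the hypothetical vnode being named;
 * the fallback directory scan and vnode handling are simplified.
 *
 *	struct vnode *vp = start_vp, *dvp;
 *	char path[MAXPATHLEN], *bp;
 *	int error;
 *
 *	bp = path + sizeof(path);		// one past the end
 *	*--bp = '\0';
 *	while (vp != rootvnode) {
 *		error = cache_revlookup(vp, &dvp, &bp, path);
 *		if (error > 0)
 *			return error;		// e.g. ERANGE: buffer full
 *		if (error == -1) {
 *			// Cache miss: the real code scans ".." on disk.
 *			return ENOENT;
 *		}
 *		// bp now points at vp's name within dvp; add a separator.
 *		*--bp = '/';
 *		vp = dvp;			// dvp comes back referenced
 *	}
 */
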
    841 /*
    842  * Add an entry to the cache
    843  */
    844 void
    845 cache_enter(struct vnode *dvp, struct vnode *vp,
    846 	    const char *name, size_t namelen, uint32_t cnflags)
    847 {
    848 	struct namecache *ncp;
    849 	struct namecache *oncp;
    850 	struct nchashhead *ncpp;
    851 	nchash_t hash;
    852 
    853 	/* First, check whether we can/should add a cache entry. */
    854 	if ((cnflags & MAKEENTRY) == 0 ||
    855 	    __predict_false(namelen > USHRT_MAX || !doingcache)) {
    856 		SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
    857 		    0, 0);
    858 		return;
    859 	}
    860 
    861 	SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);
    862 	if (numcache > desiredvnodes) {
    863 		mutex_enter(namecache_lock);
    864 		cache_ev_forced.ev_count++;
    865 		cache_reclaim();
    866 		mutex_exit(namecache_lock);
    867 	}
    868 
    869 	if (namelen > NCHNAMLEN) {
    870 		ncp = kmem_alloc(sizeof(*ncp) + namelen, KM_SLEEP);
    871 		cache_ctor(NULL, ncp, 0);
    872 	} else
    873 		ncp = pool_cache_get(namecache_cache, PR_WAITOK);
    874 
    875 	mutex_enter(namecache_lock);
    876 	numcache++;
    877 
    878 	/*
    879 	 * Concurrent lookups in the same directory may race for a
    880 	 * cache entry.  if there's a duplicated entry, free it.
     881 	 * cache entry.  If there is a duplicate entry, free it.
    882 	oncp = cache_lookup_entry(dvp, name, namelen);
    883 	if (oncp) {
    884 		cache_invalidate(oncp);
    885 		mutex_exit(&oncp->nc_lock);
    886 	}
    887 
    888 	/* Grab the vnode we just found. */
    889 	mutex_enter(&ncp->nc_lock);
    890 	ncp->nc_vp = vp;
    891 	ncp->nc_flags = 0;
    892 	ncp->nc_hittime = 0;
    893 	ncp->nc_gcqueue = NULL;
    894 	if (vp == NULL) {
    895 		/*
    896 		 * For negative hits, save the ISWHITEOUT flag so we can
    897 		 * restore it later when the cache entry is used again.
    898 		 */
    899 		ncp->nc_flags = cnflags & ISWHITEOUT;
    900 	}
    901 
    902 	/* Fill in cache info. */
    903 	ncp->nc_dvp = dvp;
    904 	LIST_INSERT_HEAD(&VNODE_TO_VIMPL(dvp)->vi_dnclist, ncp, nc_dvlist);
    905 	if (vp)
    906 		LIST_INSERT_HEAD(&VNODE_TO_VIMPL(vp)->vi_nclist, ncp, nc_vlist);
    907 	else {
    908 		ncp->nc_vlist.le_prev = NULL;
    909 		ncp->nc_vlist.le_next = NULL;
    910 	}
    911 	KASSERT(namelen <= USHRT_MAX);
    912 	ncp->nc_nlen = namelen;
    913 	memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen);
    914 	TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
    915 	hash = cache_hash(name, namelen);
    916 	ncpp = &nchashtbl[NCHASH2(hash, dvp)];
    917 
    918 	/*
    919 	 * Flush updates before making visible in table.  No need for a
    920 	 * memory barrier on the other side: to see modifications the
    921 	 * list must be followed, meaning a dependent pointer load.
    922 	 * The below is LIST_INSERT_HEAD() inlined, with the memory
    923 	 * barrier included in the correct place.
    924 	 */
    925 	if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL)
    926 		ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next;
    927 	ncp->nc_hash.le_prev = &ncpp->lh_first;
    928 	membar_producer();
    929 	ncpp->lh_first = ncp;
    930 	mutex_exit(&ncp->nc_lock);
    931 	mutex_exit(namecache_lock);
    932 }
    933 
    934 /*
    935  * Name cache initialization, from vfs_init() when we are booting
    936  */
    937 void
    938 nchinit(void)
    939 {
    940 	int error;
    941 
    942 	TAILQ_INIT(&nclruhead);
    943 	namecache_cache = pool_cache_init(sizeof(struct namecache) + NCHNAMLEN,
    944 	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
    945 	    cache_dtor, NULL);
    946 	KASSERT(namecache_cache != NULL);
    947 
    948 	namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
    949 	nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash);
    950 
    951 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
    952 	    NULL, NULL, "cachegc");
    953 	if (error != 0)
    954 		panic("nchinit %d", error);
    955 
    956 	evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
    957 	   "namecache", "entries scanned");
    958 	evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
    959 	   "namecache", "entries collected");
    960 	evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
    961 	   "namecache", "over scan target");
    962 	evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
    963 	   "namecache", "under scan target");
    964 	evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
    965 	   "namecache", "forced reclaims");
    966 
    967 	sysctl_cache_stat_setup();
    968 }
    969 
    970 static int
    971 cache_ctor(void *arg, void *obj, int flag)
    972 {
    973 	struct namecache *ncp;
    974 
    975 	ncp = obj;
    976 	mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);
    977 
    978 	return 0;
    979 }
    980 
    981 static void
    982 cache_dtor(void *arg, void *obj)
    983 {
    984 	struct namecache *ncp;
    985 
    986 	ncp = obj;
    987 	mutex_destroy(&ncp->nc_lock);
    988 }
    989 
    990 /*
    991  * Called once for each CPU in the system as attached.
    992  */
    993 void
    994 cache_cpu_init(struct cpu_info *ci)
    995 {
    996 	struct nchcpu *cpup;
    997 	size_t sz;
    998 
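	/*
	 * Over-allocate and align so that each CPU's structure sits in
	 * its own cache line(s), avoiding false sharing of cpu_lock and
	 * the counters between CPUs.
	 */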
    999 	sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit;
   1000 	cpup = kmem_zalloc(sz, KM_SLEEP);
   1001 	cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit);
   1002 	mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE);
   1003 	ci->ci_data.cpu_nch = cpup;
   1004 }
   1005 
   1006 /*
   1007  * Name cache reinitialization, for when the maximum number of vnodes increases.
   1008  */
   1009 void
   1010 nchreinit(void)
   1011 {
   1012 	struct namecache *ncp;
   1013 	struct nchashhead *oldhash, *hash;
   1014 	u_long i, oldmask, mask;
   1015 
   1016 	hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
   1017 	mutex_enter(namecache_lock);
   1018 	cache_lock_cpus();
   1019 	oldhash = nchashtbl;
   1020 	oldmask = nchash;
   1021 	nchashtbl = hash;
   1022 	nchash = mask;
   1023 	for (i = 0; i <= oldmask; i++) {
   1024 		while ((ncp = LIST_FIRST(&oldhash[i])) != NULL) {
   1025 			LIST_REMOVE(ncp, nc_hash);
   1026 			ncp->nc_hash.le_prev = NULL;
   1027 		}
   1028 	}
   1029 	cache_unlock_cpus();
   1030 	mutex_exit(namecache_lock);
   1031 	hashdone(oldhash, HASH_LIST, oldmask);
   1032 }
   1033 
   1034 /*
   1035  * Cache flush, a particular vnode; called when a vnode is renamed to
   1036  * hide entries that would now be invalid
   1037  */
   1038 void
   1039 cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
   1040 {
   1041 	struct namecache *ncp, *ncnext;
   1042 
   1043 	mutex_enter(namecache_lock);
   1044 	if (flags & PURGE_PARENTS) {
   1045 		SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
   1046 
   1047 		for (ncp = LIST_FIRST(&VNODE_TO_VIMPL(vp)->vi_nclist);
   1048 		    ncp != NULL; ncp = ncnext) {
   1049 			ncnext = LIST_NEXT(ncp, nc_vlist);
   1050 			mutex_enter(&ncp->nc_lock);
   1051 			cache_invalidate(ncp);
   1052 			mutex_exit(&ncp->nc_lock);
   1053 			cache_disassociate(ncp);
   1054 		}
   1055 	}
   1056 	if (flags & PURGE_CHILDREN) {
   1057 		SDT_PROBE(vfs, namecache, purge, children, vp, 0, 0, 0, 0);
   1058 		for (ncp = LIST_FIRST(&VNODE_TO_VIMPL(vp)->vi_dnclist);
   1059 		    ncp != NULL; ncp = ncnext) {
   1060 			ncnext = LIST_NEXT(ncp, nc_dvlist);
   1061 			mutex_enter(&ncp->nc_lock);
   1062 			cache_invalidate(ncp);
   1063 			mutex_exit(&ncp->nc_lock);
   1064 			cache_disassociate(ncp);
   1065 		}
   1066 	}
   1067 	if (name != NULL) {
   1068 		SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
   1069 		ncp = cache_lookup_entry(vp, name, namelen);
   1070 		if (ncp) {
   1071 			cache_invalidate(ncp);
   1072 			mutex_exit(&ncp->nc_lock);
   1073 			cache_disassociate(ncp);
   1074 		}
   1075 	}
   1076 	mutex_exit(namecache_lock);
   1077 }
   1078 
   1079 /*
    1080  * Cache flush for a whole file system; called when a file system is
    1081  * unmounted to remove entries that would now be invalid.
   1082  */
   1083 void
   1084 cache_purgevfs(struct mount *mp)
   1085 {
   1086 	struct namecache *ncp, *nxtcp;
   1087 
   1088 	SDT_PROBE(vfs, namecache, purge, vfs, mp, 0, 0, 0, 0);
   1089 	mutex_enter(namecache_lock);
   1090 	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
   1091 		nxtcp = TAILQ_NEXT(ncp, nc_lru);
   1092 		mutex_enter(&ncp->nc_lock);
   1093 		if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
   1094 			/* Free the resources we had. */
   1095 			cache_invalidate(ncp);
   1096 			cache_disassociate(ncp);
   1097 		}
   1098 		mutex_exit(&ncp->nc_lock);
   1099 	}
   1100 	cache_reclaim();
   1101 	mutex_exit(namecache_lock);
   1102 }
   1103 
   1104 /*
   1105  * Scan global list invalidating entries until we meet a preset target.
   1106  * Prefer to invalidate entries that have not scored a hit within
   1107  * cache_hottime seconds.  We sort the LRU list only for this routine's
   1108  * benefit.
   1109  */
   1110 static void
   1111 cache_prune(int incache, int target)
   1112 {
   1113 	struct namecache *ncp, *nxtcp, *sentinel;
   1114 	int items, recent, tryharder;
   1115 
   1116 	KASSERT(mutex_owned(namecache_lock));
   1117 
   1118 	SDT_PROBE(vfs, namecache, prune, done, incache, target, 0, 0, 0);
   1119 	items = 0;
   1120 	tryharder = 0;
   1121 	recent = hardclock_ticks - hz * cache_hottime;
   1122 	sentinel = NULL;
   1123 	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
   1124 		if (incache <= target)
   1125 			break;
   1126 		items++;
   1127 		nxtcp = TAILQ_NEXT(ncp, nc_lru);
   1128 		if (ncp == sentinel) {
   1129 			/*
   1130 			 * If we looped back on ourself, then ignore
   1131 			 * recent entries and purge whatever we find.
   1132 			 */
   1133 			tryharder = 1;
   1134 		}
   1135 		if (ncp->nc_dvp == NULL)
   1136 			continue;
   1137 		if (!tryharder && (ncp->nc_hittime - recent) > 0) {
   1138 			if (sentinel == NULL)
   1139 				sentinel = ncp;
   1140 			TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
   1141 			TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
   1142 			continue;
   1143 		}
   1144 		mutex_enter(&ncp->nc_lock);
   1145 		if (ncp->nc_dvp != NULL) {
   1146 			cache_invalidate(ncp);
   1147 			cache_disassociate(ncp);
   1148 			incache--;
   1149 		}
   1150 		mutex_exit(&ncp->nc_lock);
   1151 	}
   1152 	cache_ev_scan.ev_count += items;
   1153 }
   1154 
   1155 /*
   1156  * Collect dead cache entries from all CPUs and garbage collect.
   1157  */
   1158 static void
   1159 cache_reclaim(void)
   1160 {
   1161 	struct namecache *ncp, *next;
   1162 	int items;
   1163 
   1164 	KASSERT(mutex_owned(namecache_lock));
   1165 
   1166 	/*
   1167 	 * If the number of extant entries not awaiting garbage collection
   1168 	 * exceeds the high water mark, then reclaim stale entries until we
   1169 	 * reach our low water mark.
   1170 	 */
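	/*
	 * Illustrative numbers: with desiredvnodes == 100000, cache_hiwat
	 * == 98 and cache_lowat == 95, pruning kicks in above 98000 live
	 * entries and stops once we are back down to 95000.
	 */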
   1171 	items = numcache - cache_gcpend;
   1172 	if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
   1173 		cache_prune(items, (int)((uint64_t)desiredvnodes *
   1174 		    cache_lowat / 100));
   1175 		cache_ev_over.ev_count++;
   1176 	} else
   1177 		cache_ev_under.ev_count++;
   1178 
   1179 	/*
   1180 	 * Stop forward lookup activity on all CPUs and garbage collect dead
   1181 	 * entries.
   1182 	 */
   1183 	cache_lock_cpus();
   1184 	ncp = cache_gcqueue;
   1185 	cache_gcqueue = NULL;
   1186 	items = cache_gcpend;
   1187 	cache_gcpend = 0;
   1188 	while (ncp != NULL) {
   1189 		next = ncp->nc_gcqueue;
   1190 		cache_disassociate(ncp);
   1191 		KASSERT(ncp->nc_dvp == NULL);
   1192 		if (ncp->nc_hash.le_prev != NULL) {
   1193 			LIST_REMOVE(ncp, nc_hash);
   1194 			ncp->nc_hash.le_prev = NULL;
   1195 		}
   1196 		if (ncp->nc_nlen > NCHNAMLEN) {
   1197 			cache_dtor(NULL, ncp);
   1198 			kmem_free(ncp, sizeof(*ncp) + ncp->nc_nlen);
   1199 		} else
   1200 			pool_cache_put(namecache_cache, ncp);
   1201 		ncp = next;
   1202 	}
   1203 	cache_unlock_cpus();
   1204 	numcache -= items;
   1205 	cache_ev_gc.ev_count += items;
   1206 }
   1207 
   1208 /*
    1209  * Cache maintenance thread, awakening once per second to:
   1210  *
   1211  * => keep number of entries below the high water mark
   1212  * => sort pseudo-LRU list
   1213  * => garbage collect dead entries
   1214  */
   1215 static void
   1216 cache_thread(void *arg)
   1217 {
   1218 
   1219 	mutex_enter(namecache_lock);
   1220 	for (;;) {
   1221 		cache_reclaim();
   1222 		kpause("cachegc", false, hz, namecache_lock);
   1223 	}
   1224 }
   1225 
   1226 #ifdef DDB
   1227 void
   1228 namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
   1229 {
   1230 	struct vnode *dvp = NULL;
   1231 	struct namecache *ncp;
   1232 
   1233 	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
   1234 		if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
   1235 			(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
   1236 			dvp = ncp->nc_dvp;
   1237 		}
   1238 	}
   1239 	if (dvp == NULL) {
   1240 		(*pr)("name not found\n");
   1241 		return;
   1242 	}
   1243 	vp = dvp;
   1244 	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
   1245 		if (ncp->nc_vp == vp) {
   1246 			(*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name);
   1247 		}
   1248 	}
   1249 }
   1250 #endif
   1251 
   1252 void
   1253 namecache_count_pass2(void)
   1254 {
   1255 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
   1256 
   1257 	COUNT_UNL(cpup, ncs_pass2);
   1258 }
   1259 
   1260 void
   1261 namecache_count_2passes(void)
   1262 {
   1263 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
   1264 
   1265 	COUNT_UNL(cpup, ncs_2passes);
   1266 }
   1267 
   1268 /*
   1269  * Fetch the current values of the stats.  We return the most
   1270  * recent values harvested into nchstats by cache_reclaim(), which
   1271  * will be less than a second old.
   1272  */
   1273 static int
   1274 cache_stat_sysctl(SYSCTLFN_ARGS)
   1275 {
   1276 	struct nchstats stats;
   1277 	struct nchcpu *my_cpup;
   1278 #ifdef CACHE_STATS_CURRENT
   1279 	CPU_INFO_ITERATOR cii;
   1280 	struct cpu_info *ci;
   1281 #endif	/* CACHE_STATS_CURRENT */
   1282 
   1283 	if (oldp == NULL) {
   1284 		*oldlenp = sizeof(stats);
   1285 		return 0;
   1286 	}
   1287 
   1288 	if (*oldlenp < sizeof(stats)) {
   1289 		*oldlenp = 0;
   1290 		return 0;
   1291 	}
   1292 
   1293 	/*
   1294 	 * Take this CPU's per-cpu lock to hold off cache_reclaim()
   1295 	 * from doing a stats update while doing minimal damage to
   1296 	 * concurrent operations.
   1297 	 */
   1298 	sysctl_unlock();
   1299 	my_cpup = curcpu()->ci_data.cpu_nch;
   1300 	mutex_enter(&my_cpup->cpu_lock);
   1301 	stats = nchstats;
   1302 #ifdef CACHE_STATS_CURRENT
   1303 	for (CPU_INFO_FOREACH(cii, ci)) {
   1304 		struct nchcpu *cpup = ci->ci_data.cpu_nch;
   1305 
   1306 		ADD(stats, cpup, ncs_goodhits);
   1307 		ADD(stats, cpup, ncs_neghits);
   1308 		ADD(stats, cpup, ncs_badhits);
   1309 		ADD(stats, cpup, ncs_falsehits);
   1310 		ADD(stats, cpup, ncs_miss);
   1311 		ADD(stats, cpup, ncs_long);
   1312 		ADD(stats, cpup, ncs_pass2);
   1313 		ADD(stats, cpup, ncs_2passes);
   1314 		ADD(stats, cpup, ncs_revhits);
   1315 		ADD(stats, cpup, ncs_revmiss);
   1316 	}
   1317 #endif	/* CACHE_STATS_CURRENT */
   1318 	mutex_exit(&my_cpup->cpu_lock);
   1319 	sysctl_relock();
   1320 
   1321 	*oldlenp = sizeof(stats);
   1322 	return sysctl_copyout(l, &stats, oldp, sizeof(stats));
   1323 }
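
/*
 * Illustrative only: a userland sketch of reading these totals, assuming
 * the dynamically numbered node created below is reachable by the name
 * "vfs.namecache_stats":
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/namei.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	struct nchstats ncs;
 *	size_t len = sizeof(ncs);
 *
 *	if (sysctlbyname("vfs.namecache_stats", &ncs, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 *	printf("good hits: %llu\n", (unsigned long long)ncs.ncs_goodhits);
 */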
   1324 
   1325 static void
   1326 sysctl_cache_stat_setup(void)
   1327 {
   1328 
   1329 	KASSERT(sysctllog == NULL);
   1330 	sysctl_createv(&sysctllog, 0, NULL, NULL,
   1331 		       CTLFLAG_PERMANENT,
   1332 		       CTLTYPE_STRUCT, "namecache_stats",
   1333 		       SYSCTL_DESCR("namecache statistics"),
   1334 		       cache_stat_sysctl, 0, NULL, 0,
   1335 		       CTL_VFS, CTL_CREATE, CTL_EOL);
   1336 }
   1337