      1 /*	$NetBSD: vfs_cache.c,v 1.120 2017/03/18 22:36:56 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * Copyright (c) 1989, 1993
     31  *	The Regents of the University of California.  All rights reserved.
     32  *
     33  * Redistribution and use in source and binary forms, with or without
     34  * modification, are permitted provided that the following conditions
     35  * are met:
     36  * 1. Redistributions of source code must retain the above copyright
     37  *    notice, this list of conditions and the following disclaimer.
     38  * 2. Redistributions in binary form must reproduce the above copyright
     39  *    notice, this list of conditions and the following disclaimer in the
     40  *    documentation and/or other materials provided with the distribution.
     41  * 3. Neither the name of the University nor the names of its contributors
     42  *    may be used to endorse or promote products derived from this software
     43  *    without specific prior written permission.
     44  *
     45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     55  * SUCH DAMAGE.
     56  *
     57  *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
     58  */
     59 
     60 #include <sys/cdefs.h>
     61 __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.120 2017/03/18 22:36:56 riastradh Exp $");
     62 
     63 #ifdef _KERNEL_OPT
     64 #include "opt_ddb.h"
     65 #include "opt_dtrace.h"
     66 #include "opt_revcache.h"
     67 #endif
     68 
     69 #include <sys/param.h>
     70 #include <sys/atomic.h>
     71 #include <sys/cpu.h>
     72 #include <sys/errno.h>
     73 #include <sys/evcnt.h>
     74 #include <sys/kernel.h>
     75 #include <sys/kthread.h>
     76 #include <sys/mount.h>
     77 #include <sys/mutex.h>
     78 #include <sys/namei.h>
     79 #include <sys/pool.h>
     80 #include <sys/sdt.h>
     81 #include <sys/sysctl.h>
     82 #include <sys/systm.h>
     83 #include <sys/time.h>
     84 #include <sys/vnode_impl.h>
     85 
     86 #define NAMECACHE_ENTER_REVERSE
     87 /*
     88  * Name caching works as follows:
     89  *
     90  * Names found by directory scans are retained in a cache
     91  * for future reference.  It is managed LRU, so frequently
     92  * used names will hang around.  Cache is indexed by hash value
     93  * obtained from (dvp, name) where dvp refers to the directory
     94  * containing name.
     95  *
     96  * For simplicity (and economy of storage), names longer than
     97  * a maximum length of NCHNAMLEN are not cached; they occur
     98  * infrequently in any case, and are almost never of interest.
     99  *
    100  * Upon reaching the last segment of a path, if the reference
    101  * is for DELETE, or NOCACHE is set (rewrite), and the
    102  * name is located in the cache, it will be dropped.
    103  */
    104 
    105 /*
    106  * Cache entry lifetime:
    107  *
    108  *	nonexistent
    109  *	---create---> active
    110  *	---invalidate---> queued
    111  *	---reclaim---> nonexistent.
    112  *
    113  * States:
    114  * - Nonexistent.  Cache entry does not exist.
    115  *
    116  * - Active.  cache_lookup, cache_lookup_raw, cache_revlookup can look
    117  *   up, acquire references, and hand off references to vnodes,
    118  *   e.g. via v_interlock.  Marked by nonnull ncp->nc_dvp.
    119  *
     120  * - Queued.  Pending destruction by cache_reclaim.  Cannot be used by
    121  *   cache_lookup, cache_lookup_raw, or cache_revlookup.  May still be
    122  *   on lists.  Marked by null ncp->nc_dvp.
    123  *
    124  * Transitions:
    125  *
    126  * - Create: nonexistent--->active
    127  *
    128  *   Done by cache_enter(dvp, vp, name, namelen, cnflags), called by
    129  *   VOP_LOOKUP after the answer is found.  Allocates a struct
    130  *   namecache object, initializes it with the above fields, and
    131  *   activates it by inserting it into the forward and reverse tables.
    132  *
    133  * - Invalidate: active--->queued
    134  *
    135  *   Done by cache_invalidate.  If not already invalidated, nullify
    136  *   ncp->nc_dvp and ncp->nc_vp, and add to cache_gcqueue.  Called,
    137  *   among various other places, in cache_lookup(dvp, name, namelen,
    138  *   nameiop, cnflags, &iswht, &vp) when MAKEENTRY is missing from
    139  *   cnflags.
    140  *
    141  * - Reclaim: queued--->nonexistent
    142  *
    143  *   Done by cache_reclaim.  Disassociate ncp from any lists it is on
    144  *   and free memory.
    145  */
    146 
    147 /*
    148  * Locking.
    149  *
    150  * L namecache_lock		Global lock for namecache table and queues.
    151  * C struct nchcpu::cpu_lock	Per-CPU lock to reduce read contention.
    152  * N struct namecache::nc_lock	Per-entry lock.
    153  * V struct vnode::v_interlock	Vnode interlock.
    154  *
    155  * Lock order: L -> C -> N -> V
    156  *
    157  *	Examples:
    158  *	. L->C: cache_reclaim
    159  *	. C->N->V: cache_lookup
    160  *	. L->N->V: cache_purge1, cache_revlookup
    161  *
    162  * All use serialized by namecache_lock:
    163  *
    164  *	nclruhead / struct namecache::nc_lru
    165  *	ncvhashtbl / struct namecache::nc_vhash
    166  *	struct vnode_impl::vi_dnclist / struct namecache::nc_dvlist
    167  *	struct vnode_impl::vi_nclist / struct namecache::nc_vlist
    168  *	nchstats
    169  *
    170  * - Insertion serialized by namecache_lock,
    171  * - read protected by per-CPU lock,
    172  * - insert/read ordering guaranteed by memory barriers, and
    173  * - deletion allowed only under namecache_lock and *all* per-CPU locks
    174  *   in CPU_INFO_FOREACH order:
    175  *
    176  *	nchashtbl / struct namecache::nc_hash
    177  *
    178  *   The per-CPU locks exist only to reduce the probability of
    179  *   contention between readers.  We do not bind to a CPU, so
    180  *   contention is still possible.
    181  *
    182  * All use serialized by struct namecache::nc_lock:
    183  *
    184  *	struct namecache::nc_dvp
    185  *	struct namecache::nc_vp
    186  *	struct namecache::nc_gcqueue (*)
    187  *	struct namecache::nc_hittime (**)
    188  *
     189  * (*) Once on the queue, only cache_thread uses nc_gcqueue, unlocked.
    190  * (**) cache_prune reads nc_hittime unlocked, since approximate is OK.
    191  *
    192  * Unlocked because stable after initialization:
    193  *
    194  *	struct namecache::nc_dvp
    195  *	struct namecache::nc_vp
    196  *	struct namecache::nc_flags
    197  *	struct namecache::nc_nlen
    198  *	struct namecache::nc_name
    199  *
    200  * Unlocked because approximation is OK:
    201  *
    202  *	struct nchcpu::cpu_stats
    203  *	struct nchcpu::cpu_stats_last
    204  *
    205  * Updates under namecache_lock or any per-CPU lock are marked with
    206  * COUNT, while updates outside those locks are marked with COUNT_UNL.
    207  *
    208  * - The theory seems to have been that you could replace COUNT_UNL by
    209  *   atomic operations -- except that doesn't help unless you also
    210  *   replace COUNT by atomic operations, because mixing atomics and
    211  *   nonatomics is a recipe for failure.
    212  * - We use 32-bit per-CPU counters and 64-bit global counters under
    213  *   the theory that 32-bit counters are less likely to be hosed by
    214  *   nonatomic increment.
    215  */
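
/*
 * Illustrative sketch (not part of the original source): the C -> N -> V
 * leg of the lock order above, as it appears in cache_lookup().  The
 * locals (cpup, ncp, vp, dvp, name, namelen) are hypothetical here.
 */
#if 0
	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);		/* C: per-CPU lock */
	ncp = cache_lookup_entry(dvp, name, namelen);
	/* cache_lookup_entry() returns ncp with N (ncp->nc_lock) held */
	vp = ncp->nc_vp;
	mutex_enter(vp->v_interlock);		/* V: vnode interlock */
	mutex_exit(&ncp->nc_lock);		/* release N */
	mutex_exit(&cpup->cpu_lock);		/* release C */
#endif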
    216 
    217 /*
    218  * The comment below is preserved for posterity in case it is
    219  * important, but it is clear that everywhere the namecache_count_*()
    220  * functions are called, other cache_*() functions that take the same
    221  * locks are also called, so I can't imagine how this could be a
    222  * problem:
    223  *
    224  * N.B.: Attempting to protect COUNT_UNL() increments by taking
    225  * a per-cpu lock in the namecache_count_*() functions causes
    226  * a deadlock.  Don't do that, use atomic increments instead if
    227  * the imperfections here bug you.
    228  */
    229 
    230 /*
    231  * struct nchstats_percpu:
    232  *
    233  *	Per-CPU counters.
    234  */
    235 struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
    236 
    237 /*
    238  * struct nchcpu:
    239  *
    240  *	Per-CPU namecache state: lock and per-CPU counters.
    241  */
    242 struct nchcpu {
    243 	kmutex_t		cpu_lock;
    244 	struct nchstats_percpu	cpu_stats;
    245 	/* XXX maybe __cacheline_aligned would improve this? */
    246 	struct nchstats_percpu	cpu_stats_last;	/* from last sample */
    247 };
    248 
    249 /*
    250  * The type for the hash code. While the hash function generates a
    251  * u32, the hash code has historically been passed around as a u_long,
    252  * and the value is modified by xor'ing a uintptr_t, so it's not
    253  * entirely clear what the best type is. For now I'll leave it
    254  * unchanged as u_long.
    255  */
    256 
    257 typedef u_long nchash_t;
    258 
    259 /*
     260  * Structures associated with name caching.
    261  */
    262 
    263 static kmutex_t *namecache_lock __read_mostly;
    264 static pool_cache_t namecache_cache __read_mostly;
    265 static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;
    266 
    267 static LIST_HEAD(nchashhead, namecache) *nchashtbl __read_mostly;
    268 static u_long	nchash __read_mostly;
    269 
    270 #define	NCHASH2(hash, dvp)	\
    271 	(((hash) ^ ((uintptr_t)(dvp) >> 3)) & nchash)
    272 
    273 static LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl __read_mostly;
    274 static u_long	ncvhash __read_mostly;
    275 
    276 #define	NCVHASH(vp)		(((uintptr_t)(vp) >> 3) & ncvhash)
    277 
    278 /* Number of cache entries allocated. */
    279 static long	numcache __cacheline_aligned;
    280 
    281 /* Garbage collection queue and number of entries pending in it. */
    282 static void	*cache_gcqueue;
    283 static u_int	cache_gcpend;
    284 
     285 /* Cache effectiveness statistics.  This holds the totals from the per-cpu stats. */
    286 struct nchstats	nchstats __cacheline_aligned;
    287 
    288 /*
    289  * Macros to count an event, update the central stats with per-cpu
    290  * values and add current per-cpu increments to the subsystem total
    291  * last collected by cache_reclaim().
    292  */
    293 #define	CACHE_STATS_CURRENT	/* nothing */
    294 
    295 #define	COUNT(cpup, f)	((cpup)->cpu_stats.f++)
    296 
    297 #define	UPDATE(cpup, f) do { \
    298 	struct nchcpu *Xcpup = (cpup); \
    299 	uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \
    300 	nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \
    301 	Xcpup->cpu_stats_last.f = Xcnt; \
    302 } while (/* CONSTCOND */ 0)
    303 
    304 #define	ADD(stats, cpup, f) do { \
    305 	struct nchcpu *Xcpup = (cpup); \
    306 	stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \
    307 } while (/* CONSTCOND */ 0)
    308 
     309 /* Do unlocked stats the same way.  Use a different name so this can be changed later. */
    310 #define	COUNT_UNL(cpup, f)	COUNT((cpup), f)
    311 
    312 static const int cache_lowat = 95;
    313 static const int cache_hiwat = 98;
    314 static const int cache_hottime = 5;	/* number of seconds */
    315 static int doingcache = 1;		/* 1 => enable the cache */
    316 
    317 static struct evcnt cache_ev_scan;
    318 static struct evcnt cache_ev_gc;
    319 static struct evcnt cache_ev_over;
    320 static struct evcnt cache_ev_under;
    321 static struct evcnt cache_ev_forced;
    322 
    323 static struct namecache *cache_lookup_entry(
    324     const struct vnode *, const char *, size_t);
    325 static void cache_thread(void *);
    326 static void cache_invalidate(struct namecache *);
    327 static void cache_disassociate(struct namecache *);
    328 static void cache_reclaim(void);
    329 static int cache_ctor(void *, void *, int);
    330 static void cache_dtor(void *, void *);
    331 
    332 static struct sysctllog *sysctllog;
    333 static void sysctl_cache_stat_setup(void);
    334 
    335 SDT_PROVIDER_DEFINE(vfs);
    336 
    337 SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
    338 SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
    339 SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
    340 SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
    341 SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
    342 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
    343     "char *", "size_t");
    344 SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
    345     "char *", "size_t");
    346 SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
    347     "char *", "size_t");
    348 SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
    349      "struct vnode *");
    350 SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
    351      "int");
    352 SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
    353 SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
    354     "char *", "size_t");
    355 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
    356     "char *", "size_t");
    357 
    358 /*
    359  * Compute the hash for an entry.
    360  *
    361  * (This is for now a wrapper around namei_hash, whose interface is
    362  * for the time being slightly inconvenient.)
    363  */
    364 static nchash_t
    365 cache_hash(const char *name, size_t namelen)
    366 {
    367 	const char *endptr;
    368 
    369 	endptr = name + namelen;
    370 	return namei_hash(name, &endptr);
    371 }
    372 
    373 /*
    374  * Invalidate a cache entry and enqueue it for garbage collection.
    375  * The caller needs to hold namecache_lock or a per-cpu lock to hold
    376  * off cache_reclaim().
    377  */
    378 static void
    379 cache_invalidate(struct namecache *ncp)
    380 {
    381 	void *head;
    382 
    383 	KASSERT(mutex_owned(&ncp->nc_lock));
    384 
    385 	if (ncp->nc_dvp != NULL) {
    386 		SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp,
    387 		    0, 0, 0, 0);
    388 
    389 		ncp->nc_vp = NULL;
    390 		ncp->nc_dvp = NULL;
    391 		do {
    392 			head = cache_gcqueue;
    393 			ncp->nc_gcqueue = head;
    394 		} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
    395 		atomic_inc_uint(&cache_gcpend);
    396 	}
    397 }
    398 
    399 /*
    400  * Disassociate a namecache entry from any vnodes it is attached to,
    401  * and remove from the global LRU list.
    402  */
    403 static void
    404 cache_disassociate(struct namecache *ncp)
    405 {
    406 
    407 	KASSERT(mutex_owned(namecache_lock));
    408 	KASSERT(ncp->nc_dvp == NULL);
    409 
    410 	if (ncp->nc_lru.tqe_prev != NULL) {
    411 		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
    412 		ncp->nc_lru.tqe_prev = NULL;
    413 	}
    414 	if (ncp->nc_vhash.le_prev != NULL) {
    415 		LIST_REMOVE(ncp, nc_vhash);
    416 		ncp->nc_vhash.le_prev = NULL;
    417 	}
    418 	if (ncp->nc_vlist.le_prev != NULL) {
    419 		LIST_REMOVE(ncp, nc_vlist);
    420 		ncp->nc_vlist.le_prev = NULL;
    421 	}
    422 	if (ncp->nc_dvlist.le_prev != NULL) {
    423 		LIST_REMOVE(ncp, nc_dvlist);
    424 		ncp->nc_dvlist.le_prev = NULL;
    425 	}
    426 }
    427 
    428 /*
    429  * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
    430  * this locks out all "readers".
    431  */
    432 static void
    433 cache_lock_cpus(void)
    434 {
    435 	CPU_INFO_ITERATOR cii;
    436 	struct cpu_info *ci;
    437 	struct nchcpu *cpup;
    438 
    439 	/*
    440 	 * Lock out all CPUs first, then harvest per-cpu stats.  This
    441 	 * is probably not quite as cache-efficient as doing the lock
    442 	 * and harvest at the same time, but allows cache_stat_sysctl()
    443 	 * to make do with a per-cpu lock.
    444 	 */
    445 	for (CPU_INFO_FOREACH(cii, ci)) {
    446 		cpup = ci->ci_data.cpu_nch;
    447 		mutex_enter(&cpup->cpu_lock);
    448 	}
    449 	for (CPU_INFO_FOREACH(cii, ci)) {
    450 		cpup = ci->ci_data.cpu_nch;
    451 		UPDATE(cpup, ncs_goodhits);
    452 		UPDATE(cpup, ncs_neghits);
    453 		UPDATE(cpup, ncs_badhits);
    454 		UPDATE(cpup, ncs_falsehits);
    455 		UPDATE(cpup, ncs_miss);
    456 		UPDATE(cpup, ncs_long);
    457 		UPDATE(cpup, ncs_pass2);
    458 		UPDATE(cpup, ncs_2passes);
    459 		UPDATE(cpup, ncs_revhits);
    460 		UPDATE(cpup, ncs_revmiss);
    461 	}
    462 }
    463 
    464 /*
    465  * Release all CPU locks.
    466  */
    467 static void
    468 cache_unlock_cpus(void)
    469 {
    470 	CPU_INFO_ITERATOR cii;
    471 	struct cpu_info *ci;
    472 	struct nchcpu *cpup;
    473 
    474 	for (CPU_INFO_FOREACH(cii, ci)) {
    475 		cpup = ci->ci_data.cpu_nch;
    476 		mutex_exit(&cpup->cpu_lock);
    477 	}
    478 }
    479 
    480 /*
    481  * Find a single cache entry and return it locked.
    482  * The caller needs to hold namecache_lock or a per-cpu lock to hold
    483  * off cache_reclaim().
    484  */
    485 static struct namecache *
    486 cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen)
    487 {
    488 	struct nchashhead *ncpp;
    489 	struct namecache *ncp;
    490 	nchash_t hash;
    491 
    492 	KASSERT(dvp != NULL);
    493 	hash = cache_hash(name, namelen);
    494 	ncpp = &nchashtbl[NCHASH2(hash, dvp)];
    495 
    496 	LIST_FOREACH(ncp, ncpp, nc_hash) {
    497 		membar_datadep_consumer();	/* for Alpha... */
    498 		if (ncp->nc_dvp != dvp ||
    499 		    ncp->nc_nlen != namelen ||
    500 		    memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen))
    501 		    	continue;
    502 	    	mutex_enter(&ncp->nc_lock);
    503 		if (__predict_true(ncp->nc_dvp == dvp)) {
    504 			ncp->nc_hittime = hardclock_ticks;
    505 			SDT_PROBE(vfs, namecache, lookup, hit, dvp,
    506 			    name, namelen, 0, 0);
    507 			return ncp;
    508 		}
    509 		/* Raced: entry has been nullified. */
    510 		mutex_exit(&ncp->nc_lock);
    511 	}
    512 
    513 	SDT_PROBE(vfs, namecache, lookup, miss, dvp,
    514 	    name, namelen, 0, 0);
    515 	return NULL;
    516 }
    517 
    518 /*
     519  * Look for the name in the cache. We don't do this
    520  * if the segment name is long, simply so the cache can avoid
    521  * holding long names (which would either waste space, or
    522  * add greatly to the complexity).
    523  *
    524  * Lookup is called with DVP pointing to the directory to search,
    525  * and CNP providing the name of the entry being sought: cn_nameptr
    526  * is the name, cn_namelen is its length, and cn_flags is the flags
    527  * word from the namei operation.
    528  *
    529  * DVP must be locked.
    530  *
    531  * There are three possible non-error return states:
    532  *    1. Nothing was found in the cache. Nothing is known about
    533  *       the requested name.
    534  *    2. A negative entry was found in the cache, meaning that the
    535  *       requested name definitely does not exist.
    536  *    3. A positive entry was found in the cache, meaning that the
    537  *       requested name does exist and that we are providing the
    538  *       vnode.
    539  * In these cases the results are:
    540  *    1. 0 returned; VN is set to NULL.
    541  *    2. 1 returned; VN is set to NULL.
    542  *    3. 1 returned; VN is set to the vnode found.
    543  *
    544  * The additional result argument ISWHT is set to zero, unless a
    545  * negative entry is found that was entered as a whiteout, in which
    546  * case ISWHT is set to one.
    547  *
    548  * The ISWHT_RET argument pointer may be null. In this case an
    549  * assertion is made that the whiteout flag is not set. File systems
    550  * that do not support whiteouts can/should do this.
    551  *
    552  * Filesystems that do support whiteouts should add ISWHITEOUT to
    553  * cnp->cn_flags if ISWHT comes back nonzero.
    554  *
    555  * When a vnode is returned, it is locked, as per the vnode lookup
    556  * locking protocol.
    557  *
    558  * There is no way for this function to fail, in the sense of
    559  * generating an error that requires aborting the namei operation.
    560  *
    561  * (Prior to October 2012, this function returned an integer status,
    562  * and a vnode, and mucked with the flags word in CNP for whiteouts.
    563  * The integer status was -1 for "nothing found", ENOENT for "a
    564  * negative entry found", 0 for "a positive entry found", and possibly
    565  * other errors, and the value of VN might or might not have been set
    566  * depending on what error occurred.)
    567  */
    568 bool
    569 cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
    570 	     uint32_t nameiop, uint32_t cnflags,
    571 	     int *iswht_ret, struct vnode **vn_ret)
    572 {
    573 	struct namecache *ncp;
    574 	struct vnode *vp;
    575 	struct nchcpu *cpup;
    576 	int error;
    577 	bool hit;
    578 
    579 
    580 	/* Establish default result values */
    581 	if (iswht_ret != NULL) {
    582 		*iswht_ret = 0;
    583 	}
    584 	*vn_ret = NULL;
    585 
    586 	if (__predict_false(!doingcache)) {
    587 		return false;
    588 	}
    589 
    590 	cpup = curcpu()->ci_data.cpu_nch;
    591 	mutex_enter(&cpup->cpu_lock);
    592 	if (__predict_false(namelen > NCHNAMLEN)) {
    593 		SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
    594 		    name, namelen, 0, 0);
    595 		COUNT(cpup, ncs_long);
    596 		mutex_exit(&cpup->cpu_lock);
    597 		/* found nothing */
    598 		return false;
    599 	}
    600 
    601 	ncp = cache_lookup_entry(dvp, name, namelen);
    602 	if (__predict_false(ncp == NULL)) {
    603 		COUNT(cpup, ncs_miss);
    604 		mutex_exit(&cpup->cpu_lock);
    605 		/* found nothing */
    606 		return false;
    607 	}
    608 	if ((cnflags & MAKEENTRY) == 0) {
    609 		COUNT(cpup, ncs_badhits);
    610 		/*
    611 		 * Last component and we are renaming or deleting,
     612 		 * the cache entry is invalid, or we otherwise don't
     613 		 * want the cache entry to exist.
    614 		 */
    615 		cache_invalidate(ncp);
    616 		mutex_exit(&ncp->nc_lock);
    617 		mutex_exit(&cpup->cpu_lock);
    618 		/* found nothing */
    619 		return false;
    620 	}
    621 	if (ncp->nc_vp == NULL) {
    622 		if (iswht_ret != NULL) {
    623 			/*
    624 			 * Restore the ISWHITEOUT flag saved earlier.
    625 			 */
    626 			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
    627 			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
    628 		} else {
    629 			KASSERT(ncp->nc_flags == 0);
    630 		}
    631 
    632 		if (__predict_true(nameiop != CREATE ||
    633 		    (cnflags & ISLASTCN) == 0)) {
    634 			COUNT(cpup, ncs_neghits);
    635 			/* found neg entry; vn is already null from above */
    636 			hit = true;
    637 		} else {
    638 			COUNT(cpup, ncs_badhits);
    639 			/*
    640 			 * Last component and we are preparing to create
    641 			 * the named object, so flush the negative cache
    642 			 * entry.
    643 			 */
    644 			cache_invalidate(ncp);
    645 			/* found nothing */
    646 			hit = false;
    647 		}
    648 		mutex_exit(&ncp->nc_lock);
    649 		mutex_exit(&cpup->cpu_lock);
    650 		return hit;
    651 	}
    652 
    653 	vp = ncp->nc_vp;
    654 	mutex_enter(vp->v_interlock);
    655 	mutex_exit(&ncp->nc_lock);
    656 	mutex_exit(&cpup->cpu_lock);
    657 
    658 	/*
    659 	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
    660 	 */
    661 	error = vcache_tryvget(vp);
    662 	if (error) {
    663 		KASSERT(error == EBUSY);
    664 		/*
    665 		 * This vnode is being cleaned out.
    666 		 * XXX badhits?
    667 		 */
    668 		COUNT_UNL(cpup, ncs_falsehits);
    669 		/* found nothing */
    670 		return false;
    671 	}
    672 
    673 	COUNT_UNL(cpup, ncs_goodhits);
    674 	/* found it */
    675 	*vn_ret = vp;
    676 	return true;
    677 }
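
/*
 * Illustrative sketch (not part of the original source) of how a file
 * system's VOP_LOOKUP implementation typically consumes cache_lookup().
 * The surrounding function, "cnp", "dvp" and "vpp" are hypothetical.
 */
#if 0
	struct vnode *vp;
	int iswht = 0;

	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    cnp->cn_nameiop, cnp->cn_flags, &iswht, &vp)) {
		if (vp == NULL) {
			/* Negative entry: the name is known not to exist. */
			if (iswht)
				cnp->cn_flags |= ISWHITEOUT;
			return ENOENT;
		}
		/* Positive entry: hand the vnode back to the caller. */
		*vpp = vp;
		return 0;
	}
	/* Cache miss: fall through and scan the directory itself. */
#endif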
    678 
    679 
    680 /*
    681  * Cut-'n-pasted version of the above without the nameiop argument.
    682  */
    683 bool
    684 cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
    685 		 uint32_t cnflags,
    686 		 int *iswht_ret, struct vnode **vn_ret)
    687 {
    688 	struct namecache *ncp;
    689 	struct vnode *vp;
    690 	struct nchcpu *cpup;
    691 	int error;
    692 
    693 	/* Establish default results. */
    694 	if (iswht_ret != NULL) {
    695 		*iswht_ret = 0;
    696 	}
    697 	*vn_ret = NULL;
    698 
    699 	if (__predict_false(!doingcache)) {
    700 		/* found nothing */
    701 		return false;
    702 	}
    703 
    704 	cpup = curcpu()->ci_data.cpu_nch;
    705 	mutex_enter(&cpup->cpu_lock);
    706 	if (__predict_false(namelen > NCHNAMLEN)) {
    707 		COUNT(cpup, ncs_long);
    708 		mutex_exit(&cpup->cpu_lock);
    709 		/* found nothing */
    710 		return false;
    711 	}
    712 	ncp = cache_lookup_entry(dvp, name, namelen);
    713 	if (__predict_false(ncp == NULL)) {
    714 		COUNT(cpup, ncs_miss);
    715 		mutex_exit(&cpup->cpu_lock);
    716 		/* found nothing */
    717 		return false;
    718 	}
    719 	vp = ncp->nc_vp;
    720 	if (vp == NULL) {
    721 		/*
    722 		 * Restore the ISWHITEOUT flag saved earlier.
    723 		 */
    724 		if (iswht_ret != NULL) {
    725 			KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
    726 			/*cnp->cn_flags |= ncp->nc_flags;*/
    727 			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
    728 		}
    729 		COUNT(cpup, ncs_neghits);
    730 		mutex_exit(&ncp->nc_lock);
    731 		mutex_exit(&cpup->cpu_lock);
    732 		/* found negative entry; vn is already null from above */
    733 		return true;
    734 	}
    735 	mutex_enter(vp->v_interlock);
    736 	mutex_exit(&ncp->nc_lock);
    737 	mutex_exit(&cpup->cpu_lock);
    738 
    739 	/*
    740 	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
    741 	 */
    742 	error = vcache_tryvget(vp);
    743 	if (error) {
    744 		KASSERT(error == EBUSY);
    745 		/*
    746 		 * This vnode is being cleaned out.
    747 		 * XXX badhits?
    748 		 */
    749 		COUNT_UNL(cpup, ncs_falsehits);
    750 		/* found nothing */
    751 		return false;
    752 	}
    753 
    754 	COUNT_UNL(cpup, ncs_goodhits); /* XXX can be "badhits" */
    755 	/* found it */
    756 	*vn_ret = vp;
    757 	return true;
    758 }
    759 
    760 /*
    761  * Scan cache looking for name of directory entry pointing at vp.
    762  *
    763  * If the lookup succeeds the vnode is referenced and stored in dvpp.
    764  *
    765  * If bufp is non-NULL, also place the name in the buffer which starts
    766  * at bufp, immediately before *bpp, and move bpp backwards to point
    767  * at the start of it.  (Yes, this is a little baroque, but it's done
    768  * this way to cater to the whims of getcwd).
    769  *
    770  * Returns 0 on success, -1 on cache miss, positive errno on failure.
    771  */
    772 int
    773 cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
    774 {
    775 	struct namecache *ncp;
    776 	struct vnode *dvp;
    777 	struct ncvhashhead *nvcpp;
    778 	struct nchcpu *cpup;
    779 	char *bp;
    780 	int error, nlen;
    781 
    782 	if (!doingcache)
    783 		goto out;
    784 
    785 	nvcpp = &ncvhashtbl[NCVHASH(vp)];
    786 
    787 	/*
    788 	 * We increment counters in the local CPU's per-cpu stats.
    789 	 * We don't take the per-cpu lock, however, since this function
    790 	 * is the only place these counters are incremented so no one
    791 	 * will be racing with us to increment them.
    792 	 */
    793 	cpup = curcpu()->ci_data.cpu_nch;
    794 	mutex_enter(namecache_lock);
    795 	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
    796 		mutex_enter(&ncp->nc_lock);
    797 		if (ncp->nc_vp == vp &&
    798 		    (dvp = ncp->nc_dvp) != NULL &&
    799 		    dvp != vp) { 		/* avoid pesky . entries.. */
    800 
    801 #ifdef DIAGNOSTIC
    802 			if (ncp->nc_nlen == 1 &&
    803 			    ncp->nc_name[0] == '.')
    804 				panic("cache_revlookup: found entry for .");
    805 
    806 			if (ncp->nc_nlen == 2 &&
    807 			    ncp->nc_name[0] == '.' &&
    808 			    ncp->nc_name[1] == '.')
    809 				panic("cache_revlookup: found entry for ..");
    810 #endif
    811 			COUNT(cpup, ncs_revhits);
    812 			nlen = ncp->nc_nlen;
    813 
    814 			if (bufp) {
    815 				bp = *bpp;
    816 				bp -= nlen;
    817 				if (bp <= bufp) {
    818 					*dvpp = NULL;
    819 					mutex_exit(&ncp->nc_lock);
    820 					mutex_exit(namecache_lock);
    821 					SDT_PROBE(vfs, namecache, revlookup,
    822 					    fail, vp, ERANGE, 0, 0, 0);
    823 					return (ERANGE);
    824 				}
    825 				memcpy(bp, ncp->nc_name, nlen);
    826 				*bpp = bp;
    827 			}
    828 
    829 			mutex_enter(dvp->v_interlock);
    830 			mutex_exit(&ncp->nc_lock);
    831 			mutex_exit(namecache_lock);
    832 			error = vcache_tryvget(dvp);
    833 			if (error) {
    834 				KASSERT(error == EBUSY);
    835 				if (bufp)
    836 					(*bpp) += nlen;
    837 				*dvpp = NULL;
    838 				SDT_PROBE(vfs, namecache, revlookup, fail, vp,
    839 				    error, 0, 0, 0);
    840 				return -1;
    841 			}
    842 			*dvpp = dvp;
    843 			SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
    844 			    0, 0, 0);
    845 			return (0);
    846 		}
    847 		mutex_exit(&ncp->nc_lock);
    848 	}
    849 	COUNT(cpup, ncs_revmiss);
    850 	mutex_exit(namecache_lock);
    851  out:
    852 	*dvpp = NULL;
    853 	return (-1);
    854 }
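
/*
 * Illustrative sketch (not part of the original source) of the
 * getcwd-style calling convention described above: "bp" walks backwards
 * from the end of a caller-supplied buffer.  "path" and "pathlen" are
 * hypothetical.
 */
#if 0
	char *bufp = path;		/* start of the buffer */
	char *bp = path + pathlen;	/* one past its end */
	struct vnode *dvp;
	int error;

	error = cache_revlookup(vp, &dvp, &bp, bufp);
	if (error == 0) {
		/* bp now points at the name; dvp is referenced. */
	} else if (error == -1) {
		/* Cache miss: fall back to scanning the parent directory. */
	} else {
		/* Positive errno, e.g. ERANGE: name did not fit in buffer. */
	}
#endif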
    855 
    856 /*
    857  * Add an entry to the cache
    858  */
    859 void
    860 cache_enter(struct vnode *dvp, struct vnode *vp,
    861 	    const char *name, size_t namelen, uint32_t cnflags)
    862 {
    863 	struct namecache *ncp;
    864 	struct namecache *oncp;
    865 	struct nchashhead *ncpp;
    866 	struct ncvhashhead *nvcpp;
    867 	nchash_t hash;
    868 
    869 	/* First, check whether we can/should add a cache entry. */
    870 	if ((cnflags & MAKEENTRY) == 0 ||
    871 	    __predict_false(namelen > NCHNAMLEN || !doingcache)) {
    872 		SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
    873 		    0, 0);
    874 		return;
    875 	}
    876 
    877 	SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);
    878 	if (numcache > desiredvnodes) {
    879 		mutex_enter(namecache_lock);
    880 		cache_ev_forced.ev_count++;
    881 		cache_reclaim();
    882 		mutex_exit(namecache_lock);
    883 	}
    884 
    885 	ncp = pool_cache_get(namecache_cache, PR_WAITOK);
    886 	mutex_enter(namecache_lock);
    887 	numcache++;
    888 
    889 	/*
    890 	 * Concurrent lookups in the same directory may race for a
     891 	 * cache entry.  If there is a duplicate entry, free it.
    892 	 */
    893 	oncp = cache_lookup_entry(dvp, name, namelen);
    894 	if (oncp) {
    895 		cache_invalidate(oncp);
    896 		mutex_exit(&oncp->nc_lock);
    897 	}
    898 
    899 	/* Grab the vnode we just found. */
    900 	mutex_enter(&ncp->nc_lock);
    901 	ncp->nc_vp = vp;
    902 	ncp->nc_flags = 0;
    903 	ncp->nc_hittime = 0;
    904 	ncp->nc_gcqueue = NULL;
    905 	if (vp == NULL) {
    906 		/*
    907 		 * For negative hits, save the ISWHITEOUT flag so we can
    908 		 * restore it later when the cache entry is used again.
    909 		 */
    910 		ncp->nc_flags = cnflags & ISWHITEOUT;
    911 	}
    912 
    913 	/* Fill in cache info. */
    914 	ncp->nc_dvp = dvp;
    915 	LIST_INSERT_HEAD(&VNODE_TO_VIMPL(dvp)->vi_dnclist, ncp, nc_dvlist);
    916 	if (vp)
    917 		LIST_INSERT_HEAD(&VNODE_TO_VIMPL(vp)->vi_nclist, ncp, nc_vlist);
    918 	else {
    919 		ncp->nc_vlist.le_prev = NULL;
    920 		ncp->nc_vlist.le_next = NULL;
    921 	}
    922 	KASSERT(namelen <= NCHNAMLEN);
    923 	ncp->nc_nlen = namelen;
    924 	memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen);
    925 	TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
    926 	hash = cache_hash(name, namelen);
    927 	ncpp = &nchashtbl[NCHASH2(hash, dvp)];
    928 
    929 	/*
    930 	 * Flush updates before making visible in table.  No need for a
    931 	 * memory barrier on the other side: to see modifications the
    932 	 * list must be followed, meaning a dependent pointer load.
    933 	 * The below is LIST_INSERT_HEAD() inlined, with the memory
    934 	 * barrier included in the correct place.
    935 	 */
    936 	if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL)
    937 		ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next;
    938 	ncp->nc_hash.le_prev = &ncpp->lh_first;
    939 	membar_producer();
    940 	ncpp->lh_first = ncp;
    941 
    942 	ncp->nc_vhash.le_prev = NULL;
    943 	ncp->nc_vhash.le_next = NULL;
    944 
    945 	/*
     946 	 * Create reverse-cache entries (used by getcwd, and by the
     947 	 * Linux procfs exe node) for directories.
    948 	 */
    949 	if (vp != NULL &&
    950 	    vp != dvp &&
    951 #ifndef NAMECACHE_ENTER_REVERSE
    952 	    vp->v_type == VDIR &&
    953 #endif
    954 	    (ncp->nc_nlen > 2 ||
    955 	    (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
    956 	    (/* ncp->nc_nlen > 0 && */ ncp->nc_name[0] != '.'))) {
    957 		nvcpp = &ncvhashtbl[NCVHASH(vp)];
    958 		LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
    959 	}
    960 	mutex_exit(&ncp->nc_lock);
    961 	mutex_exit(namecache_lock);
    962 }
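
/*
 * Illustrative sketch (not part of the original source): after VOP_LOOKUP
 * has scanned the directory itself, it records the result so later
 * lookups can be answered from the cache; a NULL vnode enters a negative
 * entry.  "foundvp", "cnp" and "error" are hypothetical locals.
 */
#if 0
	if (error == 0)
		cache_enter(dvp, foundvp, cnp->cn_nameptr, cnp->cn_namelen,
		    cnp->cn_flags);
	else if (error == ENOENT)
		cache_enter(dvp, NULL, cnp->cn_nameptr, cnp->cn_namelen,
		    cnp->cn_flags);
#endif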
    963 
    964 /*
     965  * Name cache initialization, called from vfs_init() when we are booting.
    966  */
    967 void
    968 nchinit(void)
    969 {
    970 	int error;
    971 
    972 	TAILQ_INIT(&nclruhead);
    973 	namecache_cache = pool_cache_init(sizeof(struct namecache),
    974 	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
    975 	    cache_dtor, NULL);
    976 	KASSERT(namecache_cache != NULL);
    977 
    978 	namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
    979 
    980 	nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash);
    981 	ncvhashtbl =
    982 #ifdef NAMECACHE_ENTER_REVERSE
    983 	    hashinit(desiredvnodes, HASH_LIST, true, &ncvhash);
    984 #else
    985 	    hashinit(desiredvnodes/8, HASH_LIST, true, &ncvhash);
    986 #endif
    987 
    988 	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
    989 	    NULL, NULL, "cachegc");
    990 	if (error != 0)
    991 		panic("nchinit %d", error);
    992 
    993 	evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
    994 	   "namecache", "entries scanned");
    995 	evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
    996 	   "namecache", "entries collected");
    997 	evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
    998 	   "namecache", "over scan target");
    999 	evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
   1000 	   "namecache", "under scan target");
   1001 	evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
   1002 	   "namecache", "forced reclaims");
   1003 
   1004 	sysctl_cache_stat_setup();
   1005 }
   1006 
   1007 static int
   1008 cache_ctor(void *arg, void *obj, int flag)
   1009 {
   1010 	struct namecache *ncp;
   1011 
   1012 	ncp = obj;
   1013 	mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);
   1014 
   1015 	return 0;
   1016 }
   1017 
   1018 static void
   1019 cache_dtor(void *arg, void *obj)
   1020 {
   1021 	struct namecache *ncp;
   1022 
   1023 	ncp = obj;
   1024 	mutex_destroy(&ncp->nc_lock);
   1025 }
   1026 
   1027 /*
   1028  * Called once for each CPU in the system as attached.
   1029  */
   1030 void
   1031 cache_cpu_init(struct cpu_info *ci)
   1032 {
   1033 	struct nchcpu *cpup;
   1034 	size_t sz;
   1035 
   1036 	sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit;
   1037 	cpup = kmem_zalloc(sz, KM_SLEEP);
   1038 	cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit);
   1039 	mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE);
   1040 	ci->ci_data.cpu_nch = cpup;
   1041 }
   1042 
   1043 /*
   1044  * Name cache reinitialization, for when the maximum number of vnodes increases.
   1045  */
   1046 void
   1047 nchreinit(void)
   1048 {
   1049 	struct namecache *ncp;
   1050 	struct nchashhead *oldhash1, *hash1;
   1051 	struct ncvhashhead *oldhash2, *hash2;
   1052 	u_long i, oldmask1, oldmask2, mask1, mask2;
   1053 
   1054 	hash1 = hashinit(desiredvnodes, HASH_LIST, true, &mask1);
   1055 	hash2 =
   1056 #ifdef NAMECACHE_ENTER_REVERSE
   1057 	    hashinit(desiredvnodes, HASH_LIST, true, &mask2);
   1058 #else
   1059 	    hashinit(desiredvnodes/8, HASH_LIST, true, &mask2);
   1060 #endif
   1061 	mutex_enter(namecache_lock);
   1062 	cache_lock_cpus();
   1063 	oldhash1 = nchashtbl;
   1064 	oldmask1 = nchash;
   1065 	nchashtbl = hash1;
   1066 	nchash = mask1;
   1067 	oldhash2 = ncvhashtbl;
   1068 	oldmask2 = ncvhash;
   1069 	ncvhashtbl = hash2;
   1070 	ncvhash = mask2;
   1071 	for (i = 0; i <= oldmask1; i++) {
   1072 		while ((ncp = LIST_FIRST(&oldhash1[i])) != NULL) {
   1073 			LIST_REMOVE(ncp, nc_hash);
   1074 			ncp->nc_hash.le_prev = NULL;
   1075 		}
   1076 	}
   1077 	for (i = 0; i <= oldmask2; i++) {
   1078 		while ((ncp = LIST_FIRST(&oldhash2[i])) != NULL) {
   1079 			LIST_REMOVE(ncp, nc_vhash);
   1080 			ncp->nc_vhash.le_prev = NULL;
   1081 		}
   1082 	}
   1083 	cache_unlock_cpus();
   1084 	mutex_exit(namecache_lock);
   1085 	hashdone(oldhash1, HASH_LIST, oldmask1);
   1086 	hashdone(oldhash2, HASH_LIST, oldmask2);
   1087 }
   1088 
   1089 /*
   1090  * Cache flush, a particular vnode; called when a vnode is renamed to
    1091  * hide entries that would now be invalid.
   1092  */
   1093 void
   1094 cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
   1095 {
   1096 	struct namecache *ncp, *ncnext;
   1097 
   1098 	mutex_enter(namecache_lock);
   1099 	if (flags & PURGE_PARENTS) {
   1100 		SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
   1101 
   1102 		for (ncp = LIST_FIRST(&VNODE_TO_VIMPL(vp)->vi_nclist);
   1103 		    ncp != NULL; ncp = ncnext) {
   1104 			ncnext = LIST_NEXT(ncp, nc_vlist);
   1105 			mutex_enter(&ncp->nc_lock);
   1106 			cache_invalidate(ncp);
   1107 			mutex_exit(&ncp->nc_lock);
   1108 			cache_disassociate(ncp);
   1109 		}
   1110 	}
   1111 	if (flags & PURGE_CHILDREN) {
   1112 		SDT_PROBE(vfs, namecache, purge, children, vp, 0, 0, 0, 0);
   1113 		for (ncp = LIST_FIRST(&VNODE_TO_VIMPL(vp)->vi_dnclist);
   1114 		    ncp != NULL; ncp = ncnext) {
   1115 			ncnext = LIST_NEXT(ncp, nc_dvlist);
   1116 			mutex_enter(&ncp->nc_lock);
   1117 			cache_invalidate(ncp);
   1118 			mutex_exit(&ncp->nc_lock);
   1119 			cache_disassociate(ncp);
   1120 		}
   1121 	}
   1122 	if (name != NULL) {
   1123 		SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
   1124 		ncp = cache_lookup_entry(vp, name, namelen);
   1125 		if (ncp) {
   1126 			cache_invalidate(ncp);
   1127 			mutex_exit(&ncp->nc_lock);
   1128 			cache_disassociate(ncp);
   1129 		}
   1130 	}
   1131 	mutex_exit(namecache_lock);
   1132 }
   1133 
   1134 /*
   1135  * Cache flush, a whole filesystem; called when filesys is umounted to
   1136  * remove entries that would now be invalid.
   1137  */
   1138 void
   1139 cache_purgevfs(struct mount *mp)
   1140 {
   1141 	struct namecache *ncp, *nxtcp;
   1142 
   1143 	SDT_PROBE(vfs, namecache, purge, vfs, mp, 0, 0, 0, 0);
   1144 	mutex_enter(namecache_lock);
   1145 	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
   1146 		nxtcp = TAILQ_NEXT(ncp, nc_lru);
   1147 		mutex_enter(&ncp->nc_lock);
   1148 		if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
   1149 			/* Free the resources we had. */
   1150 			cache_invalidate(ncp);
   1151 			cache_disassociate(ncp);
   1152 		}
   1153 		mutex_exit(&ncp->nc_lock);
   1154 	}
   1155 	cache_reclaim();
   1156 	mutex_exit(namecache_lock);
   1157 }
   1158 
   1159 /*
   1160  * Scan global list invalidating entries until we meet a preset target.
   1161  * Prefer to invalidate entries that have not scored a hit within
   1162  * cache_hottime seconds.  We sort the LRU list only for this routine's
   1163  * benefit.
   1164  */
   1165 static void
   1166 cache_prune(int incache, int target)
   1167 {
   1168 	struct namecache *ncp, *nxtcp, *sentinel;
   1169 	int items, recent, tryharder;
   1170 
   1171 	KASSERT(mutex_owned(namecache_lock));
   1172 
   1173 	SDT_PROBE(vfs, namecache, prune, done, incache, target, 0, 0, 0);
   1174 	items = 0;
   1175 	tryharder = 0;
   1176 	recent = hardclock_ticks - hz * cache_hottime;
   1177 	sentinel = NULL;
   1178 	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
   1179 		if (incache <= target)
   1180 			break;
   1181 		items++;
   1182 		nxtcp = TAILQ_NEXT(ncp, nc_lru);
   1183 		if (ncp == sentinel) {
   1184 			/*
   1185 			 * If we looped back on ourself, then ignore
   1186 			 * recent entries and purge whatever we find.
   1187 			 */
   1188 			tryharder = 1;
   1189 		}
   1190 		if (ncp->nc_dvp == NULL)
   1191 			continue;
   1192 		if (!tryharder && (ncp->nc_hittime - recent) > 0) {
   1193 			if (sentinel == NULL)
   1194 				sentinel = ncp;
   1195 			TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
   1196 			TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
   1197 			continue;
   1198 		}
   1199 		mutex_enter(&ncp->nc_lock);
   1200 		if (ncp->nc_dvp != NULL) {
   1201 			cache_invalidate(ncp);
   1202 			cache_disassociate(ncp);
   1203 			incache--;
   1204 		}
   1205 		mutex_exit(&ncp->nc_lock);
   1206 	}
   1207 	cache_ev_scan.ev_count += items;
   1208 }
   1209 
   1210 /*
   1211  * Collect dead cache entries from all CPUs and garbage collect.
   1212  */
   1213 static void
   1214 cache_reclaim(void)
   1215 {
   1216 	struct namecache *ncp, *next;
   1217 	int items;
   1218 
   1219 	KASSERT(mutex_owned(namecache_lock));
   1220 
   1221 	/*
   1222 	 * If the number of extant entries not awaiting garbage collection
   1223 	 * exceeds the high water mark, then reclaim stale entries until we
   1224 	 * reach our low water mark.
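	 * For example, with desiredvnodes at 100000 and the defaults above
	 * (cache_hiwat 98, cache_lowat 95), pruning kicks in once more than
	 * 98000 entries are live and aims to get back down to 95000.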
   1225 	 */
   1226 	items = numcache - cache_gcpend;
   1227 	if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
   1228 		cache_prune(items, (int)((uint64_t)desiredvnodes *
   1229 		    cache_lowat / 100));
   1230 		cache_ev_over.ev_count++;
   1231 	} else
   1232 		cache_ev_under.ev_count++;
   1233 
   1234 	/*
   1235 	 * Stop forward lookup activity on all CPUs and garbage collect dead
   1236 	 * entries.
   1237 	 */
   1238 	cache_lock_cpus();
   1239 	ncp = cache_gcqueue;
   1240 	cache_gcqueue = NULL;
   1241 	items = cache_gcpend;
   1242 	cache_gcpend = 0;
   1243 	while (ncp != NULL) {
   1244 		next = ncp->nc_gcqueue;
   1245 		cache_disassociate(ncp);
   1246 		KASSERT(ncp->nc_dvp == NULL);
   1247 		if (ncp->nc_hash.le_prev != NULL) {
   1248 			LIST_REMOVE(ncp, nc_hash);
   1249 			ncp->nc_hash.le_prev = NULL;
   1250 		}
   1251 		pool_cache_put(namecache_cache, ncp);
   1252 		ncp = next;
   1253 	}
   1254 	cache_unlock_cpus();
   1255 	numcache -= items;
   1256 	cache_ev_gc.ev_count += items;
   1257 }
   1258 
   1259 /*
    1260  * Cache maintenance thread, awakening once per second to:
   1261  *
   1262  * => keep number of entries below the high water mark
   1263  * => sort pseudo-LRU list
   1264  * => garbage collect dead entries
   1265  */
   1266 static void
   1267 cache_thread(void *arg)
   1268 {
   1269 
   1270 	mutex_enter(namecache_lock);
   1271 	for (;;) {
   1272 		cache_reclaim();
   1273 		kpause("cachegc", false, hz, namecache_lock);
   1274 	}
   1275 }
   1276 
   1277 #ifdef DDB
   1278 void
   1279 namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
   1280 {
   1281 	struct vnode *dvp = NULL;
   1282 	struct namecache *ncp;
   1283 
   1284 	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
   1285 		if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
   1286 			(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
   1287 			dvp = ncp->nc_dvp;
   1288 		}
   1289 	}
   1290 	if (dvp == NULL) {
   1291 		(*pr)("name not found\n");
   1292 		return;
   1293 	}
   1294 	vp = dvp;
   1295 	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
   1296 		if (ncp->nc_vp == vp) {
   1297 			(*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name);
   1298 		}
   1299 	}
   1300 }
   1301 #endif
   1302 
   1303 void
   1304 namecache_count_pass2(void)
   1305 {
   1306 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
   1307 
   1308 	COUNT_UNL(cpup, ncs_pass2);
   1309 }
   1310 
   1311 void
   1312 namecache_count_2passes(void)
   1313 {
   1314 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
   1315 
   1316 	COUNT_UNL(cpup, ncs_2passes);
   1317 }
   1318 
   1319 /*
   1320  * Fetch the current values of the stats.  We return the most
   1321  * recent values harvested into nchstats by cache_reclaim(), which
   1322  * will be less than a second old.
   1323  */
   1324 static int
   1325 cache_stat_sysctl(SYSCTLFN_ARGS)
   1326 {
   1327 	struct nchstats stats;
   1328 	struct nchcpu *my_cpup;
   1329 #ifdef CACHE_STATS_CURRENT
   1330 	CPU_INFO_ITERATOR cii;
   1331 	struct cpu_info *ci;
   1332 #endif	/* CACHE_STATS_CURRENT */
   1333 
   1334 	if (oldp == NULL) {
   1335 		*oldlenp = sizeof(stats);
   1336 		return 0;
   1337 	}
   1338 
   1339 	if (*oldlenp < sizeof(stats)) {
   1340 		*oldlenp = 0;
   1341 		return 0;
   1342 	}
   1343 
   1344 	/*
   1345 	 * Take this CPU's per-cpu lock to hold off cache_reclaim()
   1346 	 * from doing a stats update while doing minimal damage to
   1347 	 * concurrent operations.
   1348 	 */
   1349 	sysctl_unlock();
   1350 	my_cpup = curcpu()->ci_data.cpu_nch;
   1351 	mutex_enter(&my_cpup->cpu_lock);
   1352 	stats = nchstats;
   1353 #ifdef CACHE_STATS_CURRENT
   1354 	for (CPU_INFO_FOREACH(cii, ci)) {
   1355 		struct nchcpu *cpup = ci->ci_data.cpu_nch;
   1356 
   1357 		ADD(stats, cpup, ncs_goodhits);
   1358 		ADD(stats, cpup, ncs_neghits);
   1359 		ADD(stats, cpup, ncs_badhits);
   1360 		ADD(stats, cpup, ncs_falsehits);
   1361 		ADD(stats, cpup, ncs_miss);
   1362 		ADD(stats, cpup, ncs_long);
   1363 		ADD(stats, cpup, ncs_pass2);
   1364 		ADD(stats, cpup, ncs_2passes);
   1365 		ADD(stats, cpup, ncs_revhits);
   1366 		ADD(stats, cpup, ncs_revmiss);
   1367 	}
   1368 #endif	/* CACHE_STATS_CURRENT */
   1369 	mutex_exit(&my_cpup->cpu_lock);
   1370 	sysctl_relock();
   1371 
   1372 	*oldlenp = sizeof(stats);
   1373 	return sysctl_copyout(l, &stats, oldp, sizeof(stats));
   1374 }
   1375 
   1376 static void
   1377 sysctl_cache_stat_setup(void)
   1378 {
   1379 
   1380 	KASSERT(sysctllog == NULL);
   1381 	sysctl_createv(&sysctllog, 0, NULL, NULL,
   1382 		       CTLFLAG_PERMANENT,
   1383 		       CTLTYPE_STRUCT, "namecache_stats",
   1384 		       SYSCTL_DESCR("namecache statistics"),
   1385 		       cache_stat_sysctl, 0, NULL, 0,
   1386 		       CTL_VFS, CTL_CREATE, CTL_EOL);
   1387 }
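
/*
 * Minimal userland sketch (not part of the original source) of reading
 * the "vfs.namecache_stats" node created above via sysctlbyname(3).
 * It assumes struct nchstats is visible to userland through
 * <sys/namei.h>.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/namei.h>
#include <err.h>
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	struct nchstats st;
	size_t len = sizeof(st);

	if (sysctlbyname("vfs.namecache_stats", &st, &len, NULL, 0) == -1)
		err(1, "sysctlbyname");
	printf("good hits %" PRIu64 ", misses %" PRIu64 "\n",
	    st.ncs_goodhits, st.ncs_miss);
	return 0;
}
#endif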
   1388