1 /* $NetBSD: vfs_vnode.c,v 1.148 2023/02/22 21:44:21 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of an inactive vnode, via vcache_vget(9).
80 *
81 * Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82 * was another, traditional way. Currently, only the draining thread
83 * recycles vnodes. This behaviour might be revisited.
84 *
85 * The life-cycle ends when the last reference is dropped, usually
86 * in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
87 * the file system that the vnode is inactive. Via this call, the file
88 * system indicates whether the vnode can be recycled (usually by checking
89 * its own references, e.g. the link count, or whether the file was removed).
90 *
91 * Depending on that indication, the vnode can be put onto a free list
92 * (cache), or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9)
93 * to disassociate the underlying file system from the vnode, and
94 * finally destroyed.
95 *
96 * Vnode state
97 *
98 * A vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating with the underlying file system
102 * and is not yet ready to use.
103 * - LOADED Vnode has an associated underlying file system and is
104 * ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from the underlying file
109 * system and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * BLOCKED -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system or vcache_rekey*()
129 * drops a vnode used as placeholder.
130 *
131 * Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
132 * and it is possible to wait for a state change.
133 *
134 * State is protected with v_interlock, with one exception:
135 * changing the state from LOADING requires both v_interlock and
136 * vcache_lock, so it is possible to check "state == LOADING" while
137 * holding only vcache_lock. See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 * A vnode is considered active if its reference count
142 * (vnode_t::v_usecount) is non-zero. The count is maintained using
143 * the vref(9), vrele(9) and vput(9) routines. Typical holders of
144 * references are open files, current working directories and mount points.
145 *
146 * v_usecount is adjusted with atomic operations; however, to change it
147 * from a non-zero value to zero the interlock must also be held.
148 */
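
/*
 * Illustrative sketch only (not part of the build): the life-cycle as
 * seen from a typical file system.  The inode-number key is a
 * hypothetical example; real callers pass whatever key their
 * VFS_LOADVNODE() implementation expects.
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	// vcache_get() returns the vnode referenced but unlocked.
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error)
 *		return error;
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... VOP_*() calls on the locked vnode ...
 *	// vput() unlocks and drops the reference; vrele() would drop
 *	// the reference on an already unlocked vnode.
 *	vput(vp);
 */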
149
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.148 2023/02/22 21:44:21 riastradh Exp $");
152
153 #ifdef _KERNEL_OPT
154 #include "opt_pax.h"
155 #endif
156
157 #include <sys/param.h>
158 #include <sys/kernel.h>
159
160 #include <sys/atomic.h>
161 #include <sys/buf.h>
162 #include <sys/conf.h>
163 #include <sys/device.h>
164 #include <sys/hash.h>
165 #include <sys/kauth.h>
166 #include <sys/kmem.h>
167 #include <sys/kthread.h>
168 #include <sys/module.h>
169 #include <sys/mount.h>
170 #include <sys/namei.h>
171 #include <sys/pax.h>
172 #include <sys/syscallargs.h>
173 #include <sys/sysctl.h>
174 #include <sys/systm.h>
175 #include <sys/vnode_impl.h>
176 #include <sys/wapbl.h>
177 #include <sys/fstrans.h>
178
179 #include <miscfs/deadfs/deadfs.h>
180 #include <miscfs/specfs/specdev.h>
181
182 #include <uvm/uvm.h>
183 #include <uvm/uvm_readahead.h>
184 #include <uvm/uvm_stat.h>
185
186 /* Flags to vrelel. */
187 #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
188
189 #define LRU_VRELE 0
190 #define LRU_FREE 1
191 #define LRU_HOLD 2
192 #define LRU_COUNT 3
193
194 /*
195 * There are three lru lists: one holds vnodes waiting for async release,
196 * one is for vnodes which have no buffer/page references and one for those
197 * which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
198 * private cache line as vnodes migrate between them while under the same
199 * lock (vdrain_lock).
200 */
201 u_int numvnodes __cacheline_aligned;
202 static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
203 static kmutex_t vdrain_lock __cacheline_aligned;
204 static kcondvar_t vdrain_cv;
205 static int vdrain_gen;
206 static kcondvar_t vdrain_gen_cv;
207 static bool vdrain_retry;
208 static lwp_t * vdrain_lwp;
209 SLIST_HEAD(hashhead, vnode_impl);
210 static kmutex_t vcache_lock __cacheline_aligned;
211 static kcondvar_t vcache_cv;
212 static u_int vcache_hashsize;
213 static u_long vcache_hashmask;
214 static struct hashhead *vcache_hashtab;
215 static pool_cache_t vcache_pool;
216 static void lru_requeue(vnode_t *, vnodelst_t *);
217 static vnodelst_t * lru_which(vnode_t *);
218 static vnode_impl_t * vcache_alloc(void);
219 static void vcache_dealloc(vnode_impl_t *);
220 static void vcache_free(vnode_impl_t *);
221 static void vcache_init(void);
222 static void vcache_reinit(void);
223 static void vcache_reclaim(vnode_t *);
224 static void vrelel(vnode_t *, int, int);
225 static void vdrain_thread(void *);
226 static void vnpanic(vnode_t *, const char *, ...)
227 __printflike(2, 3);
228
229 /* Routines having to do with the management of the vnode table. */
230
231 /*
232 * The high bit of v_usecount is a gate for vcache_tryvget(). It's set
233 * only when the vnode state is LOADED.
234 * The next bit of v_usecount is a flag for vrelel(). It's set
235 * from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
236 */
237 #define VUSECOUNT_MASK 0x3fffffff
238 #define VUSECOUNT_GATE 0x80000000
239 #define VUSECOUNT_VGET 0x40000000
240
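/*
 * Worked example (illustrative): a raw v_usecount of 0xc0000002 decodes
 * as VUSECOUNT_GATE set (state is LOADED), VUSECOUNT_VGET set (a
 * vcache_vget()/vcache_tryvget() succeeded since the last vrelel()),
 * and a reference count of (0xc0000002 & VUSECOUNT_MASK) == 2.
 */
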
241 /*
242 * Return the current usecount of a vnode.
243 */
244 inline int
245 vrefcnt(struct vnode *vp)
246 {
247
248 return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
249 }
250
251 /* Vnode state operations and diagnostics. */
252
253 #if defined(DIAGNOSTIC)
254
255 #define VSTATE_VALID(state) \
256 ((state) != VS_ACTIVE && (state) != VS_MARKER)
257 #define VSTATE_GET(vp) \
258 vstate_assert_get((vp), __func__, __LINE__)
259 #define VSTATE_CHANGE(vp, from, to) \
260 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
261 #define VSTATE_WAIT_STABLE(vp) \
262 vstate_assert_wait_stable((vp), __func__, __LINE__)
263
264 void
265 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
266 bool has_lock)
267 {
268 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
269 int refcnt = vrefcnt(vp);
270
271 if (!has_lock) {
272 enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
273
274 if (state == VS_ACTIVE && refcnt > 0 &&
275 (vstate == VS_LOADED || vstate == VS_BLOCKED))
276 return;
277 if (vstate == state)
278 return;
279 mutex_enter((vp)->v_interlock);
280 }
281
282 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
283
284 if ((state == VS_ACTIVE && refcnt > 0 &&
285 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
286 vip->vi_state == state) {
287 if (!has_lock)
288 mutex_exit((vp)->v_interlock);
289 return;
290 }
291 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
292 vstate_name(vip->vi_state), refcnt,
293 vstate_name(state), func, line);
294 }
295
296 static enum vnode_state
297 vstate_assert_get(vnode_t *vp, const char *func, int line)
298 {
299 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
300
301 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
302 if (! VSTATE_VALID(vip->vi_state))
303 vnpanic(vp, "state is %s at %s:%d",
304 vstate_name(vip->vi_state), func, line);
305
306 return vip->vi_state;
307 }
308
309 static void
310 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
311 {
312 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
313
314 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
315 if (! VSTATE_VALID(vip->vi_state))
316 vnpanic(vp, "state is %s at %s:%d",
317 vstate_name(vip->vi_state), func, line);
318
319 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
320 cv_wait(&vp->v_cv, vp->v_interlock);
321
322 if (! VSTATE_VALID(vip->vi_state))
323 vnpanic(vp, "state is %s at %s:%d",
324 vstate_name(vip->vi_state), func, line);
325 }
326
327 static void
328 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
329 const char *func, int line)
330 {
331 bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
332 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
333
334 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
335 if (from == VS_LOADING)
336 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
337
338 if (! VSTATE_VALID(from))
339 vnpanic(vp, "from is %s at %s:%d",
340 vstate_name(from), func, line);
341 if (! VSTATE_VALID(to))
342 vnpanic(vp, "to is %s at %s:%d",
343 vstate_name(to), func, line);
344 if (vip->vi_state != from)
345 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
346 vstate_name(vip->vi_state), vstate_name(from), func, line);
347 if ((from == VS_LOADED) != gated)
348 vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
349 vstate_name(vip->vi_state), gated, func, line);
350
351 /* Open/close the gate for vcache_tryvget(). */
352 if (to == VS_LOADED) {
353 #ifndef __HAVE_ATOMIC_AS_MEMBAR
354 membar_release();
355 #endif
356 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
357 } else {
358 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
359 }
360
361 atomic_store_relaxed(&vip->vi_state, to);
362 if (from == VS_LOADING)
363 cv_broadcast(&vcache_cv);
364 if (to == VS_LOADED || to == VS_RECLAIMED)
365 cv_broadcast(&vp->v_cv);
366 }
367
368 #else /* defined(DIAGNOSTIC) */
369
370 #define VSTATE_GET(vp) \
371 (VNODE_TO_VIMPL((vp))->vi_state)
372 #define VSTATE_CHANGE(vp, from, to) \
373 vstate_change((vp), (from), (to))
374 #define VSTATE_WAIT_STABLE(vp) \
375 vstate_wait_stable((vp))
376 void
377 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
378 bool has_lock)
379 {
380
381 }
382
383 static void
384 vstate_wait_stable(vnode_t *vp)
385 {
386 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
387
388 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
389 cv_wait(&vp->v_cv, vp->v_interlock);
390 }
391
392 static void
393 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
394 {
395 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
396
397 /* Open/close the gate for vcache_tryvget(). */
398 if (to == VS_LOADED) {
399 #ifndef __HAVE_ATOMIC_AS_MEMBAR
400 membar_release();
401 #endif
402 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
403 } else {
404 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
405 }
406
407 atomic_store_relaxed(&vip->vi_state, to);
408 if (from == VS_LOADING)
409 cv_broadcast(&vcache_cv);
410 if (to == VS_LOADED || to == VS_RECLAIMED)
411 cv_broadcast(&vp->v_cv);
412 }
413
414 #endif /* defined(DIAGNOSTIC) */
415
416 void
417 vfs_vnode_sysinit(void)
418 {
419 int error __diagused, i;
420
421 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
422 KASSERT(dead_rootmount != NULL);
423 dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
424
425 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
426 for (i = 0; i < LRU_COUNT; i++) {
427 TAILQ_INIT(&lru_list[i]);
428 }
429 vcache_init();
430
431 cv_init(&vdrain_cv, "vdrain");
432 cv_init(&vdrain_gen_cv, "vdrainwt");
433 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
434 NULL, &vdrain_lwp, "vdrain");
435 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
436 }
437
438 /*
439 * Allocate a new marker vnode.
440 */
441 vnode_t *
442 vnalloc_marker(struct mount *mp)
443 {
444 vnode_impl_t *vip;
445 vnode_t *vp;
446
447 vip = pool_cache_get(vcache_pool, PR_WAITOK);
448 memset(vip, 0, sizeof(*vip));
449 vp = VIMPL_TO_VNODE(vip);
450 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
451 vp->v_mount = mp;
452 vp->v_type = VBAD;
453 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
454 klist_init(&vip->vi_klist.vk_klist);
455 vp->v_klist = &vip->vi_klist;
456 vip->vi_state = VS_MARKER;
457
458 return vp;
459 }
460
461 /*
462 * Free a marker vnode.
463 */
464 void
465 vnfree_marker(vnode_t *vp)
466 {
467 vnode_impl_t *vip;
468
469 vip = VNODE_TO_VIMPL(vp);
470 KASSERT(vip->vi_state == VS_MARKER);
471 mutex_obj_free(vp->v_interlock);
472 uvm_obj_destroy(&vp->v_uobj, true);
473 klist_fini(&vip->vi_klist.vk_klist);
474 pool_cache_put(vcache_pool, vip);
475 }
476
477 /*
478 * Test a vnode for being a marker vnode.
479 */
480 bool
481 vnis_marker(vnode_t *vp)
482 {
483
484 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
485 }
486
487 /*
488 * Return the lru list this node should be on.
489 */
490 static vnodelst_t *
491 lru_which(vnode_t *vp)
492 {
493
494 KASSERT(mutex_owned(vp->v_interlock));
495
496 if (vp->v_holdcnt > 0)
497 return &lru_list[LRU_HOLD];
498 else
499 return &lru_list[LRU_FREE];
500 }
501
502 /*
503 * Put the vnode at the end of the given list.
504 * Both the current and the new list may be NULL; this is used on vnode alloc/free.
505 * Adjust numvnodes and signal vdrain thread if there is work.
506 */
507 static void
508 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
509 {
510 vnode_impl_t *vip;
511 int d;
512
513 /*
514 * If the vnode is on the correct list, and was put there recently,
515 * then leave it be, thus avoiding huge cache and lock contention.
516 */
517 vip = VNODE_TO_VIMPL(vp);
518 if (listhd == vip->vi_lrulisthd &&
519 (getticks() - vip->vi_lrulisttm) < hz) {
520 return;
521 }
522
523 mutex_enter(&vdrain_lock);
524 d = 0;
525 if (vip->vi_lrulisthd != NULL)
526 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
527 else
528 d++;
529 vip->vi_lrulisthd = listhd;
530 vip->vi_lrulisttm = getticks();
531 if (vip->vi_lrulisthd != NULL)
532 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
533 else
534 d--;
535 if (d != 0) {
536 /*
537 * Looks strange? This is not a bug. Don't store
538 * numvnodes unless there is a change - avoid false
539 * sharing on MP.
540 */
541 numvnodes += d;
542 }
543 if ((d > 0 && numvnodes > desiredvnodes) ||
544 listhd == &lru_list[LRU_VRELE])
545 cv_signal(&vdrain_cv);
546 mutex_exit(&vdrain_lock);
547 }
548
549 /*
550 * Release deferred vrele vnodes for this mount.
551 * Called with file system suspended.
552 */
553 void
554 vrele_flush(struct mount *mp)
555 {
556 vnode_impl_t *vip, *marker;
557 vnode_t *vp;
558 int when = 0;
559
560 KASSERT(fstrans_is_owner(mp));
561
562 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
563
564 mutex_enter(&vdrain_lock);
565 TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);
566
567 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
568 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
569 TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
570 vi_lrulist);
571 vp = VIMPL_TO_VNODE(vip);
572 if (vnis_marker(vp))
573 continue;
574
575 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
576 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
577 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
578 vip->vi_lrulisttm = getticks();
579 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
580 mutex_exit(&vdrain_lock);
581
582 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
583 mutex_enter(vp->v_interlock);
584 vrelel(vp, 0, LK_EXCLUSIVE);
585
586 if (getticks() > when) {
587 yield();
588 when = getticks() + hz / 10;
589 }
590
591 mutex_enter(&vdrain_lock);
592 }
593
594 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
595 mutex_exit(&vdrain_lock);
596
597 vnfree_marker(VIMPL_TO_VNODE(marker));
598 }
599
600 /*
601 * Reclaim a cached vnode. Used from vdrain_thread only.
602 */
603 static __inline void
604 vdrain_remove(vnode_t *vp)
605 {
606 struct mount *mp;
607
608 KASSERT(mutex_owned(&vdrain_lock));
609
610 /* Probe usecount (unlocked). */
611 if (vrefcnt(vp) > 0)
612 return;
613 /* Try v_interlock -- we lock the wrong direction! */
614 if (!mutex_tryenter(vp->v_interlock))
615 return;
616 /* Probe usecount and state. */
617 if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
618 mutex_exit(vp->v_interlock);
619 return;
620 }
621 mp = vp->v_mount;
622 if (fstrans_start_nowait(mp) != 0) {
623 mutex_exit(vp->v_interlock);
624 return;
625 }
626 vdrain_retry = true;
627 mutex_exit(&vdrain_lock);
628
629 if (vcache_vget(vp) == 0) {
630 if (!vrecycle(vp)) {
631 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
632 mutex_enter(vp->v_interlock);
633 vrelel(vp, 0, LK_EXCLUSIVE);
634 }
635 }
636 fstrans_done(mp);
637
638 mutex_enter(&vdrain_lock);
639 }
640
641 /*
642 * Release a cached vnode. Used from vdrain_thread only.
643 */
644 static __inline void
645 vdrain_vrele(vnode_t *vp)
646 {
647 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
648 struct mount *mp;
649
650 KASSERT(mutex_owned(&vdrain_lock));
651
652 mp = vp->v_mount;
653 if (fstrans_start_nowait(mp) != 0)
654 return;
655
656 /*
657 * First remove the vnode from the vrele list.
658 * Put it on the last lru list; the last vrele()
659 * will put it back onto the right list before
660 * its usecount reaches zero.
661 */
662 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
663 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
664 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
665 vip->vi_lrulisttm = getticks();
666 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
667
668 vdrain_retry = true;
669 mutex_exit(&vdrain_lock);
670
671 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
672 mutex_enter(vp->v_interlock);
673 vrelel(vp, 0, LK_EXCLUSIVE);
674 fstrans_done(mp);
675
676 mutex_enter(&vdrain_lock);
677 }
678
679 /*
680 * Helper thread to keep the number of vnodes below desiredvnodes
681 * and release vnodes from asynchronous vrele.
682 */
683 static void
684 vdrain_thread(void *cookie)
685 {
686 int i;
687 u_int target;
688 vnode_impl_t *vip, *marker;
689
690 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
691
692 mutex_enter(&vdrain_lock);
693
694 for (;;) {
695 vdrain_retry = false;
696 target = desiredvnodes - desiredvnodes/10;
697
698 for (i = 0; i < LRU_COUNT; i++) {
699 TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
700 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
701 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
702 TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,
703 vi_lrulist);
704 if (vnis_marker(VIMPL_TO_VNODE(vip)))
705 continue;
706 if (i == LRU_VRELE)
707 vdrain_vrele(VIMPL_TO_VNODE(vip));
708 else if (numvnodes < target)
709 break;
710 else
711 vdrain_remove(VIMPL_TO_VNODE(vip));
712 }
713 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
714 }
715
716 if (vdrain_retry) {
717 kpause("vdrainrt", false, 1, &vdrain_lock);
718 } else {
719 vdrain_gen++;
720 cv_broadcast(&vdrain_gen_cv);
721 cv_wait(&vdrain_cv, &vdrain_lock);
722 }
723 }
724 }
725
726 /*
727 * Try to drop reference on a vnode. Abort if we are releasing the
728 * last reference. Note: this _must_ succeed if not the last reference.
729 */
730 static bool
731 vtryrele(vnode_t *vp)
732 {
733 u_int use, next;
734
735 #ifndef __HAVE_ATOMIC_AS_MEMBAR
736 membar_release();
737 #endif
738 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
739 if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
740 return false;
741 }
742 KASSERT((use & VUSECOUNT_MASK) > 1);
743 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
744 if (__predict_true(next == use)) {
745 return true;
746 }
747 }
748 }
749
750 /*
751 * vput: unlock and release the reference.
752 */
753 void
754 vput(vnode_t *vp)
755 {
756 int lktype;
757
758 /*
759 * Do an unlocked check of the usecount. If it looks like we're not
760 * about to drop the last reference, then unlock the vnode and try
761 * to drop the reference. If it ends up being the last reference
762 * after all, vrelel() can fix it all up. Most of the time this
763 * will all go to plan.
764 */
765 if (vrefcnt(vp) > 1) {
766 VOP_UNLOCK(vp);
767 if (vtryrele(vp)) {
768 return;
769 }
770 lktype = LK_NONE;
771 } else {
772 lktype = VOP_ISLOCKED(vp);
773 KASSERT(lktype != LK_NONE);
774 }
775 mutex_enter(vp->v_interlock);
776 vrelel(vp, 0, lktype);
777 }
778
779 /*
780 * Vnode release. If the reference count drops to zero, call the inactive
781 * routine and either return the vnode to the freelist or free it to the pool.
782 */
783 static void
784 vrelel(vnode_t *vp, int flags, int lktype)
785 {
786 const bool async = ((flags & VRELEL_ASYNC) != 0);
787 bool recycle, defer, objlock_held;
788 u_int use, next;
789 int error;
790
791 objlock_held = false;
792
793 retry:
794 KASSERT(mutex_owned(vp->v_interlock));
795
796 if (__predict_false(vp->v_op == dead_vnodeop_p &&
797 VSTATE_GET(vp) != VS_RECLAIMED)) {
798 vnpanic(vp, "dead but not clean");
799 }
800
801 /*
802 * If not the last reference, just unlock and drop the reference count.
803 *
804 * Otherwise make sure we pass a point in time where we hold the
805 * last reference with VGET flag unset.
806 */
807 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
808 if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
809 if (objlock_held) {
810 objlock_held = false;
811 rw_exit(vp->v_uobj.vmobjlock);
812 }
813 if (lktype != LK_NONE) {
814 mutex_exit(vp->v_interlock);
815 lktype = LK_NONE;
816 VOP_UNLOCK(vp);
817 mutex_enter(vp->v_interlock);
818 }
819 if (vtryrele(vp)) {
820 mutex_exit(vp->v_interlock);
821 return;
822 }
823 next = atomic_load_relaxed(&vp->v_usecount);
824 continue;
825 }
826 KASSERT((use & VUSECOUNT_MASK) == 1);
827 next = use & ~VUSECOUNT_VGET;
828 if (next != use) {
829 next = atomic_cas_uint(&vp->v_usecount, use, next);
830 }
831 if (__predict_true(next == use)) {
832 break;
833 }
834 }
835 #ifndef __HAVE_ATOMIC_AS_MEMBAR
836 membar_acquire();
837 #endif
838 if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
839 vnpanic(vp, "%s: bad ref count", __func__);
840 }
841
842 #ifdef DIAGNOSTIC
843 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
844 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
845 vprint("vrelel: missing VOP_CLOSE()", vp);
846 }
847 #endif
848
849 /*
850 * If already clean there is no need to lock, defer or
851 * deactivate this node.
852 */
853 if (VSTATE_GET(vp) == VS_RECLAIMED) {
854 if (objlock_held) {
855 objlock_held = false;
856 rw_exit(vp->v_uobj.vmobjlock);
857 }
858 if (lktype != LK_NONE) {
859 mutex_exit(vp->v_interlock);
860 lktype = LK_NONE;
861 VOP_UNLOCK(vp);
862 mutex_enter(vp->v_interlock);
863 }
864 goto out;
865 }
866
867 /*
868 * First try to get the vnode locked for VOP_INACTIVE().
869 * Defer vnode release to vdrain_thread if the caller requests
870 * it explicitly, is the pagedaemon, or if taking the lock failed.
871 */
872 defer = false;
873 if ((curlwp == uvm.pagedaemon_lwp) || async) {
874 defer = true;
875 } else if (lktype == LK_SHARED) {
876 /* Excellent chance of getting the lock, if this is the last ref. */
877 error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
878 if (error != 0) {
879 defer = true;
880 } else {
881 lktype = LK_EXCLUSIVE;
882 }
883 } else if (lktype == LK_NONE) {
884 /* Excellent chance of getting the lock, if this is the last ref. */
885 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
886 if (error != 0) {
887 defer = true;
888 } else {
889 lktype = LK_EXCLUSIVE;
890 }
891 }
892 KASSERT(mutex_owned(vp->v_interlock));
893 if (defer) {
894 /*
895 * Defer reclaim to the kthread; it's not safe to
896 * clean it here. We donate it our last reference.
897 */
898 if (lktype != LK_NONE) {
899 mutex_exit(vp->v_interlock);
900 VOP_UNLOCK(vp);
901 mutex_enter(vp->v_interlock);
902 }
903 lru_requeue(vp, &lru_list[LRU_VRELE]);
904 mutex_exit(vp->v_interlock);
905 return;
906 }
907 KASSERT(lktype == LK_EXCLUSIVE);
908
909 /* If the node gained another reference, retry. */
910 use = atomic_load_relaxed(&vp->v_usecount);
911 if ((use & VUSECOUNT_VGET) != 0) {
912 goto retry;
913 }
914 KASSERT((use & VUSECOUNT_MASK) == 1);
915
916 if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
917 (vp->v_vflag & VV_MAPPED) != 0) {
918 /* Take care of space accounting. */
919 if (!objlock_held) {
920 objlock_held = true;
921 if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
922 mutex_exit(vp->v_interlock);
923 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
924 mutex_enter(vp->v_interlock);
925 goto retry;
926 }
927 }
928 if ((vp->v_iflag & VI_EXECMAP) != 0) {
929 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
930 }
931 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
932 vp->v_vflag &= ~VV_MAPPED;
933 }
934 if (objlock_held) {
935 objlock_held = false;
936 rw_exit(vp->v_uobj.vmobjlock);
937 }
938
939 /*
940 * Deactivate the vnode, but preserve our reference across
941 * the call to VOP_INACTIVE().
942 *
943 * If VOP_INACTIVE() indicates that the file has been
944 * deleted, then recycle the vnode.
945 *
946 * Note that VOP_INACTIVE() will not drop the vnode lock.
947 */
948 mutex_exit(vp->v_interlock);
949 recycle = false;
950 VOP_INACTIVE(vp, &recycle);
951 if (!recycle) {
952 lktype = LK_NONE;
953 VOP_UNLOCK(vp);
954 }
955 mutex_enter(vp->v_interlock);
956
957 /*
958 * Block new references then check again to see if a
959 * new reference was acquired in the meantime. If
960 * it was, restore the vnode state and try again.
961 */
962 if (recycle) {
963 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
964 use = atomic_load_relaxed(&vp->v_usecount);
965 if ((use & VUSECOUNT_VGET) != 0) {
966 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
967 goto retry;
968 }
969 KASSERT((use & VUSECOUNT_MASK) == 1);
970 }
971
972 /*
973 * Recycle the vnode if the file is now unused (unlinked).
974 */
975 if (recycle) {
976 VSTATE_ASSERT(vp, VS_BLOCKED);
977 KASSERT(lktype == LK_EXCLUSIVE);
978 /* vcache_reclaim drops the lock. */
979 lktype = LK_NONE;
980 vcache_reclaim(vp);
981 }
982 KASSERT(vrefcnt(vp) > 0);
983 KASSERT(lktype == LK_NONE);
984
985 out:
986 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
987 if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
988 (use & VUSECOUNT_MASK) == 1)) {
989 /* Gained and released another reference, retry. */
990 goto retry;
991 }
992 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
993 if (__predict_true(next == use)) {
994 if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
995 /* Gained another reference. */
996 mutex_exit(vp->v_interlock);
997 return;
998 }
999 break;
1000 }
1001 }
1002 #ifndef __HAVE_ATOMIC_AS_MEMBAR
1003 membar_acquire();
1004 #endif
1005
1006 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
1007 /*
1008 * It's clean so destroy it. It isn't referenced
1009 * anywhere since it has been reclaimed.
1010 */
1011 vcache_free(VNODE_TO_VIMPL(vp));
1012 } else {
1013 /*
1014 * Otherwise, put it back onto the freelist. It
1015 * can't be destroyed while still associated with
1016 * a file system.
1017 */
1018 lru_requeue(vp, lru_which(vp));
1019 mutex_exit(vp->v_interlock);
1020 }
1021 }
1022
1023 void
1024 vrele(vnode_t *vp)
1025 {
1026
1027 if (vtryrele(vp)) {
1028 return;
1029 }
1030 mutex_enter(vp->v_interlock);
1031 vrelel(vp, 0, LK_NONE);
1032 }
1033
1034 /*
1035 * Asynchronous vnode release: the vnode is released in a different context.
1036 */
1037 void
1038 vrele_async(vnode_t *vp)
1039 {
1040
1041 if (vtryrele(vp)) {
1042 return;
1043 }
1044 mutex_enter(vp->v_interlock);
1045 vrelel(vp, VRELEL_ASYNC, LK_NONE);
1046 }
1047
1048 /*
1049 * Vnode reference, where a reference is already held by some other
1050 * object (for example, a file structure).
1051 *
1052 * NB: lockless code sequences may rely on this not blocking.
1053 */
1054 void
1055 vref(vnode_t *vp)
1056 {
1057
1058 KASSERT(vrefcnt(vp) > 0);
1059
1060 atomic_inc_uint(&vp->v_usecount);
1061 }
1062
1063 /*
1064 * Page or buffer structure gets a reference.
1065 * Called with v_interlock held.
1066 */
1067 void
1068 vholdl(vnode_t *vp)
1069 {
1070
1071 KASSERT(mutex_owned(vp->v_interlock));
1072
1073 if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
1074 lru_requeue(vp, lru_which(vp));
1075 }
1076
1077 /*
1078 * Page or buffer structure gets a reference.
1079 */
1080 void
1081 vhold(vnode_t *vp)
1082 {
1083
1084 mutex_enter(vp->v_interlock);
1085 vholdl(vp);
1086 mutex_exit(vp->v_interlock);
1087 }
1088
1089 /*
1090 * Page or buffer structure frees a reference.
1091 * Called with v_interlock held.
1092 */
1093 void
1094 holdrelel(vnode_t *vp)
1095 {
1096
1097 KASSERT(mutex_owned(vp->v_interlock));
1098
1099 if (vp->v_holdcnt <= 0) {
1100 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
1101 }
1102
1103 vp->v_holdcnt--;
1104 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1105 lru_requeue(vp, lru_which(vp));
1106 }
1107
1108 /*
1109 * Page or buffer structure frees a reference.
1110 */
1111 void
1112 holdrele(vnode_t *vp)
1113 {
1114
1115 mutex_enter(vp->v_interlock);
1116 holdrelel(vp);
1117 mutex_exit(vp->v_interlock);
1118 }
1119
1120 /*
1121 * Recycle an unused vnode if the caller holds the last reference.
1122 */
1123 bool
1124 vrecycle(vnode_t *vp)
1125 {
1126 int error __diagused;
1127
1128 mutex_enter(vp->v_interlock);
1129
1130 /* If the vnode is already clean we're done. */
1131 VSTATE_WAIT_STABLE(vp);
1132 if (VSTATE_GET(vp) != VS_LOADED) {
1133 VSTATE_ASSERT(vp, VS_RECLAIMED);
1134 vrelel(vp, 0, LK_NONE);
1135 return true;
1136 }
1137
1138 /* Prevent further references until the vnode is locked. */
1139 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1140
1141 /* Make sure we hold the last reference. */
1142 if (vrefcnt(vp) != 1) {
1143 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1144 mutex_exit(vp->v_interlock);
1145 return false;
1146 }
1147
1148 mutex_exit(vp->v_interlock);
1149
1150 /*
1151 * On a leaf file system this lock will always succeed as we hold
1152 * the last reference and prevent further references.
1153 * On layered file systems waiting for the lock would open a can of
1154 * deadlocks as the lower vnodes may have other active references.
1155 */
1156 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
1157
1158 mutex_enter(vp->v_interlock);
1159 if (error) {
1160 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1161 mutex_exit(vp->v_interlock);
1162 return false;
1163 }
1164
1165 KASSERT(vrefcnt(vp) == 1);
1166 vcache_reclaim(vp);
1167 vrelel(vp, 0, LK_NONE);
1168
1169 return true;
1170 }
1171
1172 /*
1173 * Helper for vrevoke() to propagate suspension from lastmp
1174 * to thismp. Both args may be NULL.
1175 * Returns the currently suspended file system or NULL.
1176 */
1177 static struct mount *
1178 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
1179 {
1180 int error;
1181
1182 if (lastmp == thismp)
1183 return thismp;
1184
1185 if (lastmp != NULL)
1186 vfs_resume(lastmp);
1187
1188 if (thismp == NULL)
1189 return NULL;
1190
1191 do {
1192 error = vfs_suspend(thismp, 0);
1193 } while (error == EINTR || error == ERESTART);
1194
1195 if (error == 0)
1196 return thismp;
1197
1198 KASSERT(error == EOPNOTSUPP || error == ENOENT);
1199 return NULL;
1200 }
1201
1202 /*
1203 * Eliminate all activity associated with the requested vnode
1204 * and with all vnodes aliased to the requested vnode.
1205 */
1206 void
1207 vrevoke(vnode_t *vp)
1208 {
1209 struct mount *mp;
1210 vnode_t *vq;
1211 enum vtype type;
1212 dev_t dev;
1213
1214 KASSERT(vrefcnt(vp) > 0);
1215
1216 mp = vrevoke_suspend_next(NULL, vp->v_mount);
1217
1218 mutex_enter(vp->v_interlock);
1219 VSTATE_WAIT_STABLE(vp);
1220 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1221 mutex_exit(vp->v_interlock);
1222 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1223 atomic_inc_uint(&vp->v_usecount);
1224 mutex_exit(vp->v_interlock);
1225 vgone(vp);
1226 } else {
1227 dev = vp->v_rdev;
1228 type = vp->v_type;
1229 mutex_exit(vp->v_interlock);
1230
1231 while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
1232 == 0) {
1233 mp = vrevoke_suspend_next(mp, vq->v_mount);
1234 vgone(vq);
1235 }
1236 }
1237 vrevoke_suspend_next(mp, NULL);
1238 }
1239
1240 /*
1241 * Eliminate all activity associated with a vnode in preparation for
1242 * reuse. Drops a reference from the vnode.
1243 */
1244 void
1245 vgone(vnode_t *vp)
1246 {
1247 int lktype;
1248
1249 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1250
1251 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1252 lktype = LK_EXCLUSIVE;
1253 mutex_enter(vp->v_interlock);
1254 VSTATE_WAIT_STABLE(vp);
1255 if (VSTATE_GET(vp) == VS_LOADED) {
1256 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1257 vcache_reclaim(vp);
1258 lktype = LK_NONE;
1259 }
1260 VSTATE_ASSERT(vp, VS_RECLAIMED);
1261 vrelel(vp, 0, lktype);
1262 }
1263
1264 static inline uint32_t
1265 vcache_hash(const struct vcache_key *key)
1266 {
1267 uint32_t hash = HASH32_BUF_INIT;
1268
1269 KASSERT(key->vk_key_len > 0);
1270
1271 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1272 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1273 return hash;
1274 }
1275
1276 static int
1277 vcache_stats(struct hashstat_sysctl *hs, bool fill)
1278 {
1279 vnode_impl_t *vip;
1280 uint64_t chain;
1281
1282 strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
1283 strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
1284 if (!fill)
1285 return 0;
1286
1287 hs->hash_size = vcache_hashmask + 1;
1288
1289 for (size_t i = 0; i < hs->hash_size; i++) {
1290 chain = 0;
1291 mutex_enter(&vcache_lock);
1292 SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
1293 chain++;
1294 }
1295 mutex_exit(&vcache_lock);
1296 if (chain > 0) {
1297 hs->hash_used++;
1298 hs->hash_items += chain;
1299 if (chain > hs->hash_maxchain)
1300 hs->hash_maxchain = chain;
1301 }
1302 preempt_point();
1303 }
1304
1305 return 0;
1306 }
1307
1308 static void
1309 vcache_init(void)
1310 {
1311
1312 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
1313 0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1314 KASSERT(vcache_pool != NULL);
1315 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1316 cv_init(&vcache_cv, "vcache");
1317 vcache_hashsize = desiredvnodes;
1318 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1319 &vcache_hashmask);
1320 hashstat_register("vcache", vcache_stats);
1321 }
1322
1323 static void
1324 vcache_reinit(void)
1325 {
1326 int i;
1327 uint32_t hash;
1328 u_long oldmask, newmask;
1329 struct hashhead *oldtab, *newtab;
1330 vnode_impl_t *vip;
1331
1332 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1333 mutex_enter(&vcache_lock);
1334 oldtab = vcache_hashtab;
1335 oldmask = vcache_hashmask;
1336 vcache_hashsize = desiredvnodes;
1337 vcache_hashtab = newtab;
1338 vcache_hashmask = newmask;
1339 for (i = 0; i <= oldmask; i++) {
1340 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1341 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1342 hash = vcache_hash(&vip->vi_key);
1343 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1344 vip, vi_hash);
1345 }
1346 }
1347 mutex_exit(&vcache_lock);
1348 hashdone(oldtab, HASH_SLIST, oldmask);
1349 }
1350
1351 static inline vnode_impl_t *
1352 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1353 {
1354 struct hashhead *hashp;
1355 vnode_impl_t *vip;
1356
1357 KASSERT(mutex_owned(&vcache_lock));
1358
1359 hashp = &vcache_hashtab[hash & vcache_hashmask];
1360 SLIST_FOREACH(vip, hashp, vi_hash) {
1361 if (key->vk_mount != vip->vi_key.vk_mount)
1362 continue;
1363 if (key->vk_key_len != vip->vi_key.vk_key_len)
1364 continue;
1365 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1366 continue;
1367 return vip;
1368 }
1369 return NULL;
1370 }
1371
1372 /*
1373 * Allocate a new, uninitialized vcache node.
1374 */
1375 static vnode_impl_t *
1376 vcache_alloc(void)
1377 {
1378 vnode_impl_t *vip;
1379 vnode_t *vp;
1380
1381 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1382 vp = VIMPL_TO_VNODE(vip);
1383 memset(vip, 0, sizeof(*vip));
1384
1385 rw_init(&vip->vi_lock);
1386 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
1387
1388 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
1389 klist_init(&vip->vi_klist.vk_klist);
1390 vp->v_klist = &vip->vi_klist;
1391 cv_init(&vp->v_cv, "vnode");
1392 cache_vnode_init(vp);
1393
1394 vp->v_usecount = 1;
1395 vp->v_type = VNON;
1396 vp->v_size = vp->v_writesize = VSIZENOTSET;
1397
1398 vip->vi_state = VS_LOADING;
1399
1400 lru_requeue(vp, &lru_list[LRU_FREE]);
1401
1402 return vip;
1403 }
1404
1405 /*
1406 * Deallocate a vcache node in state VS_LOADING.
1407 *
1408 * vcache_lock held on entry and released on return.
1409 */
1410 static void
1411 vcache_dealloc(vnode_impl_t *vip)
1412 {
1413 vnode_t *vp;
1414
1415 KASSERT(mutex_owned(&vcache_lock));
1416
1417 vp = VIMPL_TO_VNODE(vip);
1418 vfs_ref(dead_rootmount);
1419 vfs_insmntque(vp, dead_rootmount);
1420 mutex_enter(vp->v_interlock);
1421 vp->v_op = dead_vnodeop_p;
1422 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1423 mutex_exit(&vcache_lock);
1424 vrelel(vp, 0, LK_NONE);
1425 }
1426
1427 /*
1428 * Free an unused, unreferenced vcache node.
1429 * v_interlock locked on entry.
1430 */
1431 static void
1432 vcache_free(vnode_impl_t *vip)
1433 {
1434 vnode_t *vp;
1435
1436 vp = VIMPL_TO_VNODE(vip);
1437 KASSERT(mutex_owned(vp->v_interlock));
1438
1439 KASSERT(vrefcnt(vp) == 0);
1440 KASSERT(vp->v_holdcnt == 0);
1441 KASSERT(vp->v_writecount == 0);
1442 lru_requeue(vp, NULL);
1443 mutex_exit(vp->v_interlock);
1444
1445 vfs_insmntque(vp, NULL);
1446 if (vp->v_type == VBLK || vp->v_type == VCHR)
1447 spec_node_destroy(vp);
1448
1449 mutex_obj_free(vp->v_interlock);
1450 rw_destroy(&vip->vi_lock);
1451 uvm_obj_destroy(&vp->v_uobj, true);
1452 KASSERT(vp->v_klist == &vip->vi_klist);
1453 klist_fini(&vip->vi_klist.vk_klist);
1454 cv_destroy(&vp->v_cv);
1455 cache_vnode_fini(vp);
1456 pool_cache_put(vcache_pool, vip);
1457 }
1458
1459 /*
1460 * Try to get an initial reference on this cached vnode.
1461 * Returns zero on success or EBUSY if the vnode state is not LOADED.
1462 *
1463 * NB: lockless code sequences may rely on this not blocking.
1464 */
1465 int
1466 vcache_tryvget(vnode_t *vp)
1467 {
1468 u_int use, next;
1469
1470 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1471 if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
1472 return EBUSY;
1473 }
1474 next = atomic_cas_uint(&vp->v_usecount,
1475 use, (use + 1) | VUSECOUNT_VGET);
1476 if (__predict_true(next == use)) {
1477 #ifndef __HAVE_ATOMIC_AS_MEMBAR
1478 membar_acquire();
1479 #endif
1480 return 0;
1481 }
1482 }
1483 }
1484
1485 /*
1486 * Try to get an initial reference on this cached vnode.
1487 * Returns zero on success or ENOENT if the vnode has been reclaimed.
1488 * Will wait for the vnode state to be stable.
1489 *
1490 * v_interlock locked on entry and unlocked on exit.
1491 */
1492 int
1493 vcache_vget(vnode_t *vp)
1494 {
1495 int error;
1496
1497 KASSERT(mutex_owned(vp->v_interlock));
1498
1499 /* Increment hold count to prevent vnode from disappearing. */
1500 vp->v_holdcnt++;
1501 VSTATE_WAIT_STABLE(vp);
1502 vp->v_holdcnt--;
1503
1504 /* If this was the last reference to a reclaimed vnode free it now. */
1505 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1506 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1507 vcache_free(VNODE_TO_VIMPL(vp));
1508 else
1509 mutex_exit(vp->v_interlock);
1510 return ENOENT;
1511 }
1512 VSTATE_ASSERT(vp, VS_LOADED);
1513 error = vcache_tryvget(vp);
1514 KASSERT(error == 0);
1515 mutex_exit(vp->v_interlock);
1516
1517 return 0;
1518 }
1519
1520 /*
1521 * Get a vnode / fs node pair by key and return it referenced through vpp.
1522 */
1523 int
1524 vcache_get(struct mount *mp, const void *key, size_t key_len,
1525 struct vnode **vpp)
1526 {
1527 int error;
1528 uint32_t hash;
1529 const void *new_key;
1530 struct vnode *vp;
1531 struct vcache_key vcache_key;
1532 vnode_impl_t *vip, *new_vip;
1533
1534 new_key = NULL;
1535 *vpp = NULL;
1536
1537 vcache_key.vk_mount = mp;
1538 vcache_key.vk_key = key;
1539 vcache_key.vk_key_len = key_len;
1540 hash = vcache_hash(&vcache_key);
1541
1542 again:
1543 mutex_enter(&vcache_lock);
1544 vip = vcache_hash_lookup(&vcache_key, hash);
1545
1546 /* If found, take a reference or retry. */
1547 if (__predict_true(vip != NULL)) {
1548 /*
1549 * If the vnode is loading we cannot take the v_interlock
1550 * here as it might change during load (see uvm_obj_setlock()).
1551 * As changing state from VS_LOADING requires both vcache_lock
1552 * and v_interlock it is safe to test with vcache_lock held.
1553 *
1554 * Wait for vnodes changing state from VS_LOADING and retry.
1555 */
1556 if (__predict_false(vip->vi_state == VS_LOADING)) {
1557 cv_wait(&vcache_cv, &vcache_lock);
1558 mutex_exit(&vcache_lock);
1559 goto again;
1560 }
1561 vp = VIMPL_TO_VNODE(vip);
1562 mutex_enter(vp->v_interlock);
1563 mutex_exit(&vcache_lock);
1564 error = vcache_vget(vp);
1565 if (error == ENOENT)
1566 goto again;
1567 if (error == 0)
1568 *vpp = vp;
1569 KASSERT((error != 0) == (*vpp == NULL));
1570 return error;
1571 }
1572 mutex_exit(&vcache_lock);
1573
1574 /* Allocate and initialize a new vcache / vnode pair. */
1575 error = vfs_busy(mp);
1576 if (error)
1577 return error;
1578 new_vip = vcache_alloc();
1579 new_vip->vi_key = vcache_key;
1580 vp = VIMPL_TO_VNODE(new_vip);
1581 mutex_enter(&vcache_lock);
1582 vip = vcache_hash_lookup(&vcache_key, hash);
1583 if (vip == NULL) {
1584 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1585 new_vip, vi_hash);
1586 vip = new_vip;
1587 }
1588
1589 /* If another thread beat us inserting this node, retry. */
1590 if (vip != new_vip) {
1591 vcache_dealloc(new_vip);
1592 vfs_unbusy(mp);
1593 goto again;
1594 }
1595 mutex_exit(&vcache_lock);
1596
1597 /* Load the fs node. Exclusive as new_vip is VS_LOADING. */
1598 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1599 if (error) {
1600 mutex_enter(&vcache_lock);
1601 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1602 new_vip, vnode_impl, vi_hash);
1603 vcache_dealloc(new_vip);
1604 vfs_unbusy(mp);
1605 KASSERT(*vpp == NULL);
1606 return error;
1607 }
1608 KASSERT(new_key != NULL);
1609 KASSERT(memcmp(key, new_key, key_len) == 0);
1610 KASSERT(vp->v_op != NULL);
1611 vfs_insmntque(vp, mp);
1612 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1613 vp->v_vflag |= VV_MPSAFE;
1614 vfs_ref(mp);
1615 vfs_unbusy(mp);
1616
1617 /* Finished loading, finalize node. */
1618 mutex_enter(&vcache_lock);
1619 new_vip->vi_key.vk_key = new_key;
1620 mutex_enter(vp->v_interlock);
1621 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1622 mutex_exit(vp->v_interlock);
1623 mutex_exit(&vcache_lock);
1624 *vpp = vp;
1625 return 0;
1626 }
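
/*
 * Illustrative sketch (an assumption about typical use, not code from
 * this file): a file system's vget-style entry point usually wraps
 * vcache_get() with its inode number as the key and then locks the
 * result.  The name example_vget and its signature are hypothetical.
 *
 *	static int
 *	example_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
 *	{
 *		int error;
 *
 *		error = vcache_get(mp, &ino, sizeof(ino), vpp);
 *		if (error)
 *			return error;
 *		error = vn_lock(*vpp, lktype);
 *		if (error) {
 *			vrele(*vpp);
 *			*vpp = NULL;
 *			return error;
 *		}
 *		return 0;
 *	}
 */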
1627
1628 /*
1629 * Create a new vnode / fs node pair and return it referenced through vpp.
1630 */
1631 int
1632 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1633 kauth_cred_t cred, void *extra, struct vnode **vpp)
1634 {
1635 int error;
1636 uint32_t hash;
1637 struct vnode *vp, *ovp;
1638 vnode_impl_t *vip, *ovip;
1639
1640 *vpp = NULL;
1641
1642 /* Allocate and initialize a new vcache / vnode pair. */
1643 error = vfs_busy(mp);
1644 if (error)
1645 return error;
1646 vip = vcache_alloc();
1647 vip->vi_key.vk_mount = mp;
1648 vp = VIMPL_TO_VNODE(vip);
1649
1650 /* Create and load the fs node. */
1651 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1652 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1653 if (error) {
1654 mutex_enter(&vcache_lock);
1655 vcache_dealloc(vip);
1656 vfs_unbusy(mp);
1657 KASSERT(*vpp == NULL);
1658 return error;
1659 }
1660 KASSERT(vp->v_op != NULL);
1661 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1662 if (vip->vi_key.vk_key_len > 0) {
1663 KASSERT(vip->vi_key.vk_key != NULL);
1664 hash = vcache_hash(&vip->vi_key);
1665
1666 /*
1667 * Wait for previous instance to be reclaimed,
1668 * then insert new node.
1669 */
1670 mutex_enter(&vcache_lock);
1671 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1672 ovp = VIMPL_TO_VNODE(ovip);
1673 mutex_enter(ovp->v_interlock);
1674 mutex_exit(&vcache_lock);
1675 error = vcache_vget(ovp);
1676 KASSERT(error == ENOENT);
1677 mutex_enter(&vcache_lock);
1678 }
1679 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1680 vip, vi_hash);
1681 mutex_exit(&vcache_lock);
1682 }
1683 vfs_insmntque(vp, mp);
1684 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1685 vp->v_vflag |= VV_MPSAFE;
1686 vfs_ref(mp);
1687 vfs_unbusy(mp);
1688
1689 /* Finished loading, finalize node. */
1690 mutex_enter(&vcache_lock);
1691 mutex_enter(vp->v_interlock);
1692 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1693 mutex_exit(&vcache_lock);
1694 mutex_exit(vp->v_interlock);
1695 *vpp = vp;
1696 return 0;
1697 }
1698
1699 /*
1700 * Prepare key change: update the old cache node's key and lock the new cache node.
1701 * Return an error if the new node already exists.
1702 */
1703 int
1704 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1705 const void *old_key, size_t old_key_len,
1706 const void *new_key, size_t new_key_len)
1707 {
1708 uint32_t old_hash, new_hash;
1709 struct vcache_key old_vcache_key, new_vcache_key;
1710 vnode_impl_t *vip, *new_vip;
1711
1712 old_vcache_key.vk_mount = mp;
1713 old_vcache_key.vk_key = old_key;
1714 old_vcache_key.vk_key_len = old_key_len;
1715 old_hash = vcache_hash(&old_vcache_key);
1716
1717 new_vcache_key.vk_mount = mp;
1718 new_vcache_key.vk_key = new_key;
1719 new_vcache_key.vk_key_len = new_key_len;
1720 new_hash = vcache_hash(&new_vcache_key);
1721
1722 new_vip = vcache_alloc();
1723 new_vip->vi_key = new_vcache_key;
1724
1725 /* Insert locked new node used as placeholder. */
1726 mutex_enter(&vcache_lock);
1727 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1728 if (vip != NULL) {
1729 vcache_dealloc(new_vip);
1730 return EEXIST;
1731 }
1732 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1733 new_vip, vi_hash);
1734
1735 /* Replace the old node's key with the temporary copy. */
1736 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1737 KASSERT(vip != NULL);
1738 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1739 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1740 vip->vi_key = old_vcache_key;
1741 mutex_exit(&vcache_lock);
1742 return 0;
1743 }
1744
1745 /*
1746 * Key change complete: update old node and remove placeholder.
1747 */
1748 void
1749 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1750 const void *old_key, size_t old_key_len,
1751 const void *new_key, size_t new_key_len)
1752 {
1753 uint32_t old_hash, new_hash;
1754 struct vcache_key old_vcache_key, new_vcache_key;
1755 vnode_impl_t *vip, *new_vip;
1756 struct vnode *new_vp;
1757
1758 old_vcache_key.vk_mount = mp;
1759 old_vcache_key.vk_key = old_key;
1760 old_vcache_key.vk_key_len = old_key_len;
1761 old_hash = vcache_hash(&old_vcache_key);
1762
1763 new_vcache_key.vk_mount = mp;
1764 new_vcache_key.vk_key = new_key;
1765 new_vcache_key.vk_key_len = new_key_len;
1766 new_hash = vcache_hash(&new_vcache_key);
1767
1768 mutex_enter(&vcache_lock);
1769
1770 /* Lookup old and new node. */
1771 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1772 KASSERT(vip != NULL);
1773 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1774
1775 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1776 KASSERT(new_vip != NULL);
1777 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1778 new_vp = VIMPL_TO_VNODE(new_vip);
1779 mutex_enter(new_vp->v_interlock);
1780 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1781 mutex_exit(new_vp->v_interlock);
1782
1783 /* Rekey old node and put it onto its new hashlist. */
1784 vip->vi_key = new_vcache_key;
1785 if (old_hash != new_hash) {
1786 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1787 vip, vnode_impl, vi_hash);
1788 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1789 vip, vi_hash);
1790 }
1791
1792 /* Remove new node used as placeholder. */
1793 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1794 new_vip, vnode_impl, vi_hash);
1795 vcache_dealloc(new_vip);
1796 }
1797
1798 /*
1799 * Disassociate the underlying file system from a vnode.
1800 *
1801 * Must be called with vnode locked and will return unlocked.
1802 * Must be called with the interlock held, and will return with it held.
1803 */
1804 static void
1805 vcache_reclaim(vnode_t *vp)
1806 {
1807 lwp_t *l = curlwp;
1808 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1809 struct mount *mp = vp->v_mount;
1810 uint32_t hash;
1811 uint8_t temp_buf[64], *temp_key;
1812 size_t temp_key_len;
1813 bool recycle;
1814 int error;
1815
1816 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1817 KASSERT(mutex_owned(vp->v_interlock));
1818 KASSERT(vrefcnt(vp) != 0);
1819
1820 temp_key_len = vip->vi_key.vk_key_len;
1821 /*
1822 * Prevent the vnode from being recycled or brought into use
1823 * while we clean it out.
1824 */
1825 VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
1826
1827 /*
1828 * Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
1829 * because VOP_RECLAIM() could cause vp->v_klist to
1830 * become invalid. Don't check for interest in NOTE_REVOKE
1831 * here; it's always posted because it sets EV_EOF.
1832 *
1833 * Once it's been posted, reset vp->v_klist to point to
1834 * our own local storage, in case we were sharing with
1835 * someone else.
1836 */
1837 KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
1838 vp->v_klist = &vip->vi_klist;
1839 mutex_exit(vp->v_interlock);
1840
1841 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1842 mutex_enter(vp->v_interlock);
1843 if ((vp->v_iflag & VI_EXECMAP) != 0) {
1844 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
1845 }
1846 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1847 vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
1848 mutex_exit(vp->v_interlock);
1849 rw_exit(vp->v_uobj.vmobjlock);
1850
1851 /*
1852 * With the vnode state set to reclaiming, purge the name cache
1853 * immediately to prevent new handles on the vnode, and wait for existing
1854 * threads trying to get a handle to notice the VS_RECLAIMED status and abort.
1855 */
1856 cache_purge(vp);
1857
1858 /* Replace the vnode key with a temporary copy. */
1859 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1860 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1861 } else {
1862 temp_key = temp_buf;
1863 }
1864 if (vip->vi_key.vk_key_len > 0) {
1865 mutex_enter(&vcache_lock);
1866 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1867 vip->vi_key.vk_key = temp_key;
1868 mutex_exit(&vcache_lock);
1869 }
1870
1871 fstrans_start(mp);
1872
1873 /*
1874 * Clean out any cached data associated with the vnode.
1875 */
1876 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1877 if (error != 0) {
1878 if (wapbl_vphaswapbl(vp))
1879 WAPBL_DISCARD(wapbl_vptomp(vp));
1880 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1881 }
1882 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1883 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1884 if (vp->v_type == VBLK || vp->v_type == VCHR) {
1885 spec_node_revoke(vp);
1886 }
1887
1888 /*
1889 * Disassociate the underlying file system from the vnode.
1890 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1891 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1892 * would no longer function.
1893 */
1894 VOP_INACTIVE(vp, &recycle);
1895 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1896 if (VOP_RECLAIM(vp)) {
1897 vnpanic(vp, "%s: cannot reclaim", __func__);
1898 }
1899
1900 KASSERT(vp->v_data == NULL);
1901 KASSERT((vp->v_iflag & VI_PAGES) == 0);
1902
1903 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1904 uvm_ra_freectx(vp->v_ractx);
1905 vp->v_ractx = NULL;
1906 }
1907
1908 if (vip->vi_key.vk_key_len > 0) {
1909 /* Remove from vnode cache. */
1910 hash = vcache_hash(&vip->vi_key);
1911 mutex_enter(&vcache_lock);
1912 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1913 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1914 vip, vnode_impl, vi_hash);
1915 mutex_exit(&vcache_lock);
1916 }
1917 if (temp_key != temp_buf)
1918 kmem_free(temp_key, temp_key_len);
1919
1920 /* Done with purge, notify sleepers of the grim news. */
1921 mutex_enter(vp->v_interlock);
1922 vp->v_op = dead_vnodeop_p;
1923 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1924 vp->v_tag = VT_NON;
1925 mutex_exit(vp->v_interlock);
1926
1927 /*
1928 * Move to dead mount. Must be after changing the operations
1929 * vector as vnode operations enter the mount before using the
1930 * operations vector. See sys/kern/vnode_if.c.
1931 */
1932 vp->v_vflag &= ~VV_ROOT;
1933 vfs_ref(dead_rootmount);
1934 vfs_insmntque(vp, dead_rootmount);
1935
1936 #ifdef PAX_SEGVGUARD
1937 pax_segvguard_cleanup(vp);
1938 #endif /* PAX_SEGVGUARD */
1939
1940 mutex_enter(vp->v_interlock);
1941 fstrans_done(mp);
1942 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1943 }
1944
1945 /*
1946 * Disassociate the underlying file system from an open device vnode
1947 * and make it anonymous.
1948 *
1949 * Vnode unlocked on entry, drops a reference to the vnode.
1950 */
1951 void
1952 vcache_make_anon(vnode_t *vp)
1953 {
1954 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1955 uint32_t hash;
1956 bool recycle;
1957
1958 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
1959 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1960 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
1961
1962 /* Remove from vnode cache. */
1963 hash = vcache_hash(&vip->vi_key);
1964 mutex_enter(&vcache_lock);
1965 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1966 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1967 vip, vnode_impl, vi_hash);
1968 vip->vi_key.vk_mount = dead_rootmount;
1969 vip->vi_key.vk_key_len = 0;
1970 vip->vi_key.vk_key = NULL;
1971 mutex_exit(&vcache_lock);
1972
1973 /*
1974 * Disassociate the underlying file system from the vnode.
1975 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1976 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1977 * would no longer function.
1978 */
1979 if (vn_lock(vp, LK_EXCLUSIVE)) {
1980 vnpanic(vp, "%s: cannot lock", __func__);
1981 }
1982 VOP_INACTIVE(vp, &recycle);
1983 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1984 if (VOP_RECLAIM(vp)) {
1985 vnpanic(vp, "%s: cannot reclaim", __func__);
1986 }
1987
1988 /* Purge name cache. */
1989 cache_purge(vp);
1990
1991 /* Done with purge, change operations vector. */
1992 mutex_enter(vp->v_interlock);
1993 vp->v_op = spec_vnodeop_p;
1994 vp->v_vflag |= VV_MPSAFE;
1995 mutex_exit(vp->v_interlock);
1996
1997 /*
1998 * Move to dead mount. Must be after changing the operations
1999 * vector as vnode operations enter the mount before using the
2000 * operations vector. See sys/kern/vnode_if.c.
2001 */
2002 vfs_ref(dead_rootmount);
2003 vfs_insmntque(vp, dead_rootmount);
2004
2005 vrele(vp);
2006 }
2007
2008 /*
2009 * Update outstanding I/O count and do wakeup if requested.
2010 */
2011 void
2012 vwakeup(struct buf *bp)
2013 {
2014 vnode_t *vp;
2015
2016 if ((vp = bp->b_vp) == NULL)
2017 return;
2018
2019 KASSERT(bp->b_objlock == vp->v_interlock);
2020 KASSERT(mutex_owned(bp->b_objlock));
2021
2022 if (--vp->v_numoutput < 0)
2023 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
2024 if (vp->v_numoutput == 0)
2025 cv_broadcast(&vp->v_cv);
2026 }
2027
2028 /*
2029 * Test a vnode for being or becoming dead. Returns one of:
2030 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
2031 * ENOENT: vnode is dead.
2032 * 0: otherwise.
2033 *
2034 * Whenever this function returns a non-zero value all future
2035 * calls will also return a non-zero value.
2036 */
2037 int
2038 vdead_check(struct vnode *vp, int flags)
2039 {
2040
2041 KASSERT(mutex_owned(vp->v_interlock));
2042
2043 if (! ISSET(flags, VDEAD_NOWAIT))
2044 VSTATE_WAIT_STABLE(vp);
2045
2046 if (VSTATE_GET(vp) == VS_RECLAIMING) {
2047 KASSERT(ISSET(flags, VDEAD_NOWAIT));
2048 return EBUSY;
2049 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
2050 return ENOENT;
2051 }
2052
2053 return 0;
2054 }
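
/*
 * Illustrative sketch (assumption about typical use): callers hold
 * v_interlock and branch on the result, e.g. before issuing new I/O
 * on a device vnode.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		return error;	-- EBUSY: becoming dead, ENOENT: dead
 */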
2055
2056 int
2057 vfs_drainvnodes(void)
2058 {
2059 int i, gen;
2060
2061 mutex_enter(&vdrain_lock);
2062 for (i = 0; i < 2; i++) {
2063 gen = vdrain_gen;
2064 while (gen == vdrain_gen) {
2065 cv_broadcast(&vdrain_cv);
2066 cv_wait(&vdrain_gen_cv, &vdrain_lock);
2067 }
2068 }
2069 mutex_exit(&vdrain_lock);
2070
2071 if (numvnodes >= desiredvnodes)
2072 return EBUSY;
2073
2074 if (vcache_hashsize != desiredvnodes)
2075 vcache_reinit();
2076
2077 return 0;
2078 }
2079
2080 void
2081 vnpanic(vnode_t *vp, const char *fmt, ...)
2082 {
2083 va_list ap;
2084
2085 #ifdef DIAGNOSTIC
2086 vprint(NULL, vp);
2087 #endif
2088 va_start(ap, fmt);
2089 vpanic(fmt, ap);
2090 va_end(ap);
2091 }
2092
2093 void
2094 vshareilock(vnode_t *tvp, vnode_t *fvp)
2095 {
2096 kmutex_t *oldlock;
2097
2098 oldlock = tvp->v_interlock;
2099 mutex_obj_hold(fvp->v_interlock);
2100 tvp->v_interlock = fvp->v_interlock;
2101 mutex_obj_free(oldlock);
2102 }
2103
2104 void
2105 vshareklist(vnode_t *tvp, vnode_t *fvp)
2106 {
2107 /*
2108 * If two vnodes share klist state, they must also share
2109 * an interlock.
2110 */
2111 KASSERT(tvp->v_interlock == fvp->v_interlock);
2112
2113 /*
2114 * We make the following assumptions:
2115 *
2116 * ==> Some other synchronization is happening outside of
2117 * our view to make this safe.
2118 *
2119 * ==> That the "to" vnode will have the necessary references
2120 * on the "from" vnode so that the storage for the klist
2121 * won't be yanked out from beneath us (the vnode_impl).
2122 *
2123 * ==> If "from" is also sharing, we then assume that "from"
2124 * has the necessary references, and so on.
2125 */
2126 tvp->v_klist = fvp->v_klist;
2127 }
2128