1 /* $NetBSD: vfs_vnode.c,v 1.153.2.1 2025/08/02 05:57:44 perseant Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of an inactive vnode, via vcache_vget(9).
80 *
81 * Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82 * was another, traditional way. Currently, only the draining thread
83 * recycles vnodes. This behaviour might be revisited.
84 *
85 * The life-cycle ends when the last reference is dropped, usually
86 * in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
87 * the file system that the vnode is inactive. Via this call, the file
88 * system indicates whether the vnode can be recycled (usually, it checks
89 * its own references, e.g. the link count, or whether the file was removed).
90 *
91 * Depending on that indication, the vnode is either put onto a free list
92 * (cache), or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9) to
93 * disassociate the underlying file system from the vnode, after which it
94 * is finally destroyed.
95 *
96 * Vnode state
97 *
98 * A vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating with the underlying file system
102 * and is not yet ready to use.
103 * - LOADED Vnode has an associated underlying file system and is
104 * ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from the underlying file
109 * system and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * BLOCKED -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system, or vcache_rekey*()
129 * drops a vnode used as a placeholder.
130 *
131 * Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
132 * and it is possible to wait for a state change.
133 *
134 * State is protected with v_interlock with one exception:
135 * to change from LOADING both v_interlock and vcache_lock must be held
136 * so it is possible to check "state == LOADING" without holding
137 * v_interlock. See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 * A vnode is considered active if its reference count
142 * (vnode_t::v_usecount) is non-zero. The count is maintained using the
143 * vref(9), vrele(9) and vput(9) routines. Typical holders of references
144 * are e.g. open files, current working directories and mount points.
145 *
146 * v_usecount is adjusted with atomic operations; however, to change it
147 * from a non-zero value to zero, the interlock must also be held.
148 */
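
/*
 * Illustrative only (not part of the original comment): a hedged sketch
 * of the reference counting rules above.  "obj" stands for a hypothetical
 * structure already known to hold a reference on the vnode while we take
 * our own:
 *
 *	struct vnode *vp = obj->o_vnode;
 *
 *	vref(vp);			// take our own reference
 *	...use vp, unlocked...
 *	vrele(vp);			// and drop it again
 *
 * or, if the vnode lock is needed as well:
 *
 *	vref(vp);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	...use vp, locked...
 *	vput(vp);			// unlock and drop the reference
 *
 * vref(9) may only be used while another reference is known to be held;
 * the very first reference on a cached vnode is taken with
 * vcache_vget(9)/vcache_tryvget(9) below.
 */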
149
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153.2.1 2025/08/02 05:57:44 perseant Exp $");
152
153 #ifdef _KERNEL_OPT
154 #include "opt_pax.h"
155 #endif
156
157 #include <sys/param.h>
158 #include <sys/types.h>
159
160 #include <sys/atomic.h>
161 #include <sys/buf.h>
162 #include <sys/conf.h>
163 #include <sys/device.h>
164 #include <sys/fstrans.h>
165 #include <sys/hash.h>
166 #include <sys/kauth.h>
167 #include <sys/kernel.h>
168 #include <sys/kmem.h>
169 #include <sys/module.h>
170 #include <sys/mount.h>
171 #include <sys/namei.h>
172 #include <sys/pax.h>
173 #include <sys/sdt.h>
174 #include <sys/syscallargs.h>
175 #include <sys/sysctl.h>
176 #include <sys/systm.h>
177 #include <sys/threadpool.h>
178 #include <sys/vnode_impl.h>
179 #include <sys/wapbl.h>
180
181 #include <miscfs/deadfs/deadfs.h>
182 #include <miscfs/specfs/specdev.h>
183
184 #include <uvm/uvm.h>
185 #include <uvm/uvm_readahead.h>
186 #include <uvm/uvm_stat.h>
187
188 /* Flags to vrelel. */
189 #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
190
191 #define LRU_VRELE 0
192 #define LRU_FREE 1
193 #define LRU_HOLD 2
194 #define LRU_COUNT 3
195
196 /*
197 * There are three lru lists: one holds vnodes waiting for async release,
198 * one is for vnodes which have no buffer/page references and one for those
199 * which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
200 * private cache line as vnodes migrate between them while under the same
201 * lock (vdrain_lock).
202 */
203
204 typedef struct {
205 vnode_impl_t *li_marker;
206 } lru_iter_t;
207
208 u_int numvnodes __cacheline_aligned;
209 static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
210 static struct threadpool *threadpool;
211 static struct threadpool_job vdrain_job;
212 static struct threadpool_job vrele_job;
213 static kmutex_t vdrain_lock __cacheline_aligned;
214 SLIST_HEAD(hashhead, vnode_impl);
215 static kmutex_t vcache_lock __cacheline_aligned;
216 static kcondvar_t vcache_cv;
217 static u_int vcache_hashsize;
218 static u_long vcache_hashmask;
219 static struct hashhead *vcache_hashtab;
220 static pool_cache_t vcache_pool;
221 static void lru_requeue(vnode_t *, vnodelst_t *);
222 static vnodelst_t * lru_which(vnode_t *);
223 static vnode_impl_t * lru_iter_first(int, lru_iter_t *);
224 static vnode_impl_t * lru_iter_next(lru_iter_t *);
225 static void lru_iter_release(lru_iter_t *);
226 static vnode_impl_t * vcache_alloc(void);
227 static void vcache_dealloc(vnode_impl_t *);
228 static void vcache_free(vnode_impl_t *);
229 static void vcache_init(void);
230 static void vcache_reinit(void);
231 static void vcache_reclaim(vnode_t *);
232 static void vrele_deferred(vnode_impl_t *);
233 static void vrelel(vnode_t *, int, int);
234 static void vnpanic(vnode_t *, const char *, ...)
235 __printflike(2, 3);
236 static bool vdrain_one(u_int);
237 static void vdrain_task(struct threadpool_job *);
238 static void vrele_task(struct threadpool_job *);
239
240 /* Routines having to do with the management of the vnode table. */
241
242 /*
243 * The high bit of v_usecount is a gate for vcache_tryvget(). It's set
244 * only when the vnode state is LOADED.
245 * The next bit of v_usecount is a flag for vrelel(). It's set
246 * from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
247 */
248 #define VUSECOUNT_MASK 0x3fffffff
249 #define VUSECOUNT_GATE 0x80000000
250 #define VUSECOUNT_VGET 0x40000000
251
252 /*
253 * Return the current usecount of a vnode.
254 */
255 inline int
256 vrefcnt(struct vnode *vp)
257 {
258
259 return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
260 }
261
262 /* Vnode state operations and diagnostics. */
263
264 #if defined(DIAGNOSTIC)
265
266 #define VSTATE_VALID(state) \
267 ((state) != VS_ACTIVE && (state) != VS_MARKER)
268 #define VSTATE_GET(vp) \
269 vstate_assert_get((vp), __func__, __LINE__)
270 #define VSTATE_CHANGE(vp, from, to) \
271 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
272 #define VSTATE_WAIT_STABLE(vp) \
273 vstate_assert_wait_stable((vp), __func__, __LINE__)
274
275 void
276 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
277 bool has_lock)
278 {
279 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
280 int refcnt = vrefcnt(vp);
281
282 if (!has_lock) {
283 enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
284
285 if (state == VS_ACTIVE && refcnt > 0 &&
286 (vstate == VS_LOADED || vstate == VS_BLOCKED))
287 return;
288 if (vstate == state)
289 return;
290 mutex_enter((vp)->v_interlock);
291 }
292
293 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
294
295 if ((state == VS_ACTIVE && refcnt > 0 &&
296 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
297 vip->vi_state == state) {
298 if (!has_lock)
299 mutex_exit((vp)->v_interlock);
300 return;
301 }
302 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
303 vstate_name(vip->vi_state), refcnt,
304 vstate_name(state), func, line);
305 }
306
307 static enum vnode_state
308 vstate_assert_get(vnode_t *vp, const char *func, int line)
309 {
310 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
311
312 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
313 if (! VSTATE_VALID(vip->vi_state))
314 vnpanic(vp, "state is %s at %s:%d",
315 vstate_name(vip->vi_state), func, line);
316
317 return vip->vi_state;
318 }
319
320 static void
321 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
322 {
323 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
324
325 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
326 if (! VSTATE_VALID(vip->vi_state))
327 vnpanic(vp, "state is %s at %s:%d",
328 vstate_name(vip->vi_state), func, line);
329
330 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
331 cv_wait(&vp->v_cv, vp->v_interlock);
332
333 if (! VSTATE_VALID(vip->vi_state))
334 vnpanic(vp, "state is %s at %s:%d",
335 vstate_name(vip->vi_state), func, line);
336 }
337
338 static void
339 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
340 const char *func, int line)
341 {
342 bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
343 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
344
345 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
346 if (from == VS_LOADING)
347 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
348
349 if (! VSTATE_VALID(from))
350 vnpanic(vp, "from is %s at %s:%d",
351 vstate_name(from), func, line);
352 if (! VSTATE_VALID(to))
353 vnpanic(vp, "to is %s at %s:%d",
354 vstate_name(to), func, line);
355 if (vip->vi_state != from)
356 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
357 vstate_name(vip->vi_state), vstate_name(from), func, line);
358 if ((from == VS_LOADED) != gated)
359 vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
360 vstate_name(vip->vi_state), gated, func, line);
361
362 /* Open/close the gate for vcache_tryvget(). */
363 if (to == VS_LOADED) {
364 membar_release();
365 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
366 } else {
367 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
368 }
369
370 atomic_store_relaxed(&vip->vi_state, to);
371 if (from == VS_LOADING)
372 cv_broadcast(&vcache_cv);
373 if (to == VS_LOADED || to == VS_RECLAIMED)
374 cv_broadcast(&vp->v_cv);
375 }
376
377 #else /* defined(DIAGNOSTIC) */
378
379 #define VSTATE_GET(vp) \
380 (VNODE_TO_VIMPL((vp))->vi_state)
381 #define VSTATE_CHANGE(vp, from, to) \
382 vstate_change((vp), (from), (to))
383 #define VSTATE_WAIT_STABLE(vp) \
384 vstate_wait_stable((vp))
385 void
386 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
387 bool has_lock)
388 {
389
390 }
391
392 static void
393 vstate_wait_stable(vnode_t *vp)
394 {
395 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
396
397 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
398 cv_wait(&vp->v_cv, vp->v_interlock);
399 }
400
401 static void
402 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
403 {
404 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
405
406 /* Open/close the gate for vcache_tryvget(). */
407 if (to == VS_LOADED) {
408 membar_release();
409 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
410 } else {
411 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
412 }
413
414 atomic_store_relaxed(&vip->vi_state, to);
415 if (from == VS_LOADING)
416 cv_broadcast(&vcache_cv);
417 if (to == VS_LOADED || to == VS_RECLAIMED)
418 cv_broadcast(&vp->v_cv);
419 }
420
421 #endif /* defined(DIAGNOSTIC) */
422
423 void
424 vfs_vnode_sysinit(void)
425 {
426 int error __diagused, i;
427
428 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
429 KASSERT(dead_rootmount != NULL);
430 dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
431
432 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
433 for (i = 0; i < LRU_COUNT; i++) {
434 TAILQ_INIT(&lru_list[i]);
435 }
436 vcache_init();
437
438 error = threadpool_get(&threadpool, PRI_NONE);
439 KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
440 threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
441 threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
442 }
443
444 /*
445 * Allocate a new marker vnode.
446 */
447 vnode_t *
448 vnalloc_marker(struct mount *mp)
449 {
450 vnode_impl_t *vip;
451 vnode_t *vp;
452
453 vip = pool_cache_get(vcache_pool, PR_WAITOK);
454 memset(vip, 0, sizeof(*vip));
455 vp = VIMPL_TO_VNODE(vip);
456 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
457 vp->v_mount = mp;
458 vp->v_type = VBAD;
459 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
460 klist_init(&vip->vi_klist.vk_klist);
461 vp->v_klist = &vip->vi_klist;
462 vip->vi_state = VS_MARKER;
463
464 return vp;
465 }
466
467 /*
468 * Free a marker vnode.
469 */
470 void
471 vnfree_marker(vnode_t *vp)
472 {
473 vnode_impl_t *vip;
474
475 vip = VNODE_TO_VIMPL(vp);
476 KASSERT(vip->vi_state == VS_MARKER);
477 mutex_obj_free(vp->v_interlock);
478 uvm_obj_destroy(&vp->v_uobj, true);
479 klist_fini(&vip->vi_klist.vk_klist);
480 pool_cache_put(vcache_pool, vip);
481 }
482
483 /*
484 * Test a vnode for being a marker vnode.
485 */
486 bool
487 vnis_marker(vnode_t *vp)
488 {
489
490 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
491 }
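
/*
 * Illustrative only: marker vnodes keep a traversal's place in a vnode
 * list across lock drops.  A hedged sketch of the pattern (the lru_iter_*
 * helpers below are the in-tree user); "lock" and "list" stand for
 * whatever protects the list being walked, linked through vi_lrulist:
 *
 *	vnode_impl_t *vip, *marker;
 *
 *	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
 *	mutex_enter(lock);
 *	TAILQ_INSERT_HEAD(list, marker, vi_lrulist);
 *	while ((vip = TAILQ_NEXT(marker, vi_lrulist)) != NULL) {
 *		TAILQ_REMOVE(list, marker, vi_lrulist);
 *		TAILQ_INSERT_AFTER(list, vip, marker, vi_lrulist);
 *		if (vnis_marker(VIMPL_TO_VNODE(vip)))
 *			continue;
 *		...process vip, possibly dropping and retaking "lock"...
 *	}
 *	TAILQ_REMOVE(list, marker, vi_lrulist);
 *	mutex_exit(lock);
 *	vnfree_marker(VIMPL_TO_VNODE(marker));
 */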
492
493 /*
494 * Return the lru list this node should be on.
495 */
496 static vnodelst_t *
497 lru_which(vnode_t *vp)
498 {
499
500 KASSERT(mutex_owned(vp->v_interlock));
501
502 if (vp->v_holdcnt > 0)
503 return &lru_list[LRU_HOLD];
504 else
505 return &lru_list[LRU_FREE];
506 }
507
508 /*
509 * Put the vnode at the end of the given list.
510 * Both the current and the new list may be NULL; this is used on vnode
511 * alloc/free. Adjust numvnodes and signal the vdrain thread if there is work.
512 */
513 static void
514 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
515 {
516 vnode_impl_t *vip;
517 int d;
518
519 /*
520 * If the vnode is on the correct list, and was put there recently,
521 * then leave it be, thus avoiding huge cache and lock contention.
522 */
523 vip = VNODE_TO_VIMPL(vp);
524 if (listhd == vip->vi_lrulisthd &&
525 (getticks() - vip->vi_lrulisttm) < hz) {
526 return;
527 }
528
529 mutex_enter(&vdrain_lock);
530 d = 0;
531 if (vip->vi_lrulisthd != NULL)
532 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
533 else
534 d++;
535 vip->vi_lrulisthd = listhd;
536 vip->vi_lrulisttm = getticks();
537 if (vip->vi_lrulisthd != NULL)
538 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
539 else
540 d--;
541 if (d != 0) {
542 /*
543 * Looks strange? This is not a bug. Don't store
544 * numvnodes unless there is a change - avoid false
545 * sharing on MP.
546 */
547 numvnodes += d;
548 }
549 if (listhd == &lru_list[LRU_VRELE])
550 threadpool_schedule_job(threadpool, &vrele_job);
551 if (d > 0 && numvnodes > desiredvnodes)
552 threadpool_schedule_job(threadpool, &vdrain_job);
553 if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
554 kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
555 mutex_exit(&vdrain_lock);
556 }
557
558 /*
559 * LRU list iterator.
560 * Caller holds vdrain_lock.
561 */
562 static vnode_impl_t *
563 lru_iter_first(int idx, lru_iter_t *iterp)
564 {
565 vnode_impl_t *marker;
566
567 KASSERT(mutex_owned(&vdrain_lock));
568
569 mutex_exit(&vdrain_lock);
570 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
571 mutex_enter(&vdrain_lock);
572 marker->vi_lrulisthd = &lru_list[idx];
573 iterp->li_marker = marker;
574
575 TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
576
577 return lru_iter_next(iterp);
578 }
579
580 static vnode_impl_t *
581 lru_iter_next(lru_iter_t *iter)
582 {
583 vnode_impl_t *vip, *marker;
584 vnodelst_t *listhd;
585
586 KASSERT(mutex_owned(&vdrain_lock));
587
588 marker = iter->li_marker;
589 listhd = marker->vi_lrulisthd;
590
591 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
592 TAILQ_REMOVE(listhd, marker, vi_lrulist);
593 TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
594 if (!vnis_marker(VIMPL_TO_VNODE(vip)))
595 break;
596 }
597
598 return vip;
599 }
600
601 static void
602 lru_iter_release(lru_iter_t *iter)
603 {
604 vnode_impl_t *marker;
605
606 KASSERT(mutex_owned(&vdrain_lock));
607
608 marker = iter->li_marker;
609 TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
610
611 mutex_exit(&vdrain_lock);
612 vnfree_marker(VIMPL_TO_VNODE(marker));
613 mutex_enter(&vdrain_lock);
614 }
615
616 /*
617 * Release deferred vrele vnodes for this mount.
618 * Called with file system suspended.
619 */
620 void
621 vrele_flush(struct mount *mp)
622 {
623 lru_iter_t iter;
624 vnode_impl_t *vip;
625
626 KASSERT(fstrans_is_owner(mp));
627
628 mutex_enter(&vdrain_lock);
629 for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
630 vip = lru_iter_next(&iter)) {
631 if (VIMPL_TO_VNODE(vip)->v_mount != mp)
632 continue;
633 vrele_deferred(vip);
634 }
635 lru_iter_release(&iter);
636 mutex_exit(&vdrain_lock);
637 }
638
639 /*
640 * One pass through the LRU lists to keep the number of allocated
641 * vnodes below target. Returns true if target met.
642 */
643 static bool
644 vdrain_one(u_int target)
645 {
646 int ix, lists[] = { LRU_FREE, LRU_HOLD };
647 lru_iter_t iter;
648 vnode_impl_t *vip;
649 vnode_t *vp;
650 struct mount *mp;
651
652 KASSERT(mutex_owned(&vdrain_lock));
653
654 for (ix = 0; ix < __arraycount(lists); ix++) {
655 for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
656 vip = lru_iter_next(&iter)) {
657 if (numvnodes < target) {
658 lru_iter_release(&iter);
659 return true;
660 }
661
662 vp = VIMPL_TO_VNODE(vip);
663
664 /* Probe usecount (unlocked). */
665 if (vrefcnt(vp) > 0)
666 continue;
667 /* Try v_interlock -- we lock the wrong direction! */
668 if (!mutex_tryenter(vp->v_interlock))
669 continue;
670 /* Probe usecount and state. */
671 if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
672 mutex_exit(vp->v_interlock);
673 continue;
674 }
675 mutex_exit(&vdrain_lock);
676
677 mp = vp->v_mount;
678 if (fstrans_start_nowait(mp) != 0) {
679 mutex_exit(vp->v_interlock);
680 mutex_enter(&vdrain_lock);
681 continue;
682 }
683
684 if (vcache_vget(vp) == 0) {
685 if (!vrecycle(vp)) {
686 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
687 mutex_enter(vp->v_interlock);
688 vrelel(vp, 0, LK_EXCLUSIVE);
689 }
690 }
691 fstrans_done(mp);
692
693 mutex_enter(&vdrain_lock);
694 }
695 lru_iter_release(&iter);
696 }
697
698 return false;
699 }
700
701 /*
702 * threadpool task to keep the number of vnodes below desiredvnodes.
703 */
704 static void
705 vdrain_task(struct threadpool_job *job)
706 {
707 u_int target;
708
709 target = desiredvnodes - desiredvnodes / 16;
710
711 mutex_enter(&vdrain_lock);
712
713 while (!vdrain_one(target))
714 kpause("vdrain", false, 1, &vdrain_lock);
715
716 threadpool_job_done(job);
717 mutex_exit(&vdrain_lock);
718 }
719
720 /*
721 * threadpool task to process asynchronous vrele.
722 */
723 static void
724 vrele_task(struct threadpool_job *job)
725 {
726 int skipped;
727 lru_iter_t iter;
728 vnode_impl_t *vip;
729 struct mount *mp;
730
731 mutex_enter(&vdrain_lock);
732 while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
733 for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
734 mp = VIMPL_TO_VNODE(vip)->v_mount;
735 if (fstrans_start_nowait(mp) == 0) {
736 vrele_deferred(vip);
737 fstrans_done(mp);
738 } else {
739 skipped++;
740 }
741 }
742
743 lru_iter_release(&iter);
744 if (skipped) {
745 kpause("vrele", false, MAX(1, mstohz(10)),
746 &vdrain_lock);
747 }
748 }
749
750 threadpool_job_done(job);
751 lru_iter_release(&iter);
752 mutex_exit(&vdrain_lock);
753 }
754
755 /*
756 * Try to drop a reference on a vnode. Abort if we are releasing the
757 * last reference. Note: this _must_ succeed if it is not the last reference.
758 */
759 static bool
760 vtryrele(vnode_t *vp)
761 {
762 u_int use, next;
763
764 membar_release();
765 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
766 if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
767 return false;
768 }
769 KASSERT((use & VUSECOUNT_MASK) > 1);
770 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
771 if (__predict_true(next == use)) {
772 return true;
773 }
774 }
775 }
776
777 /*
778 * vput: unlock and release the reference.
779 */
780 void
781 vput(vnode_t *vp)
782 {
783 int lktype;
784
785 /*
786 * Do an unlocked check of the usecount. If it looks like we're not
787 * about to drop the last reference, then unlock the vnode and try
788 * to drop the reference. If it ends up being the last reference
789 * after all, vrelel() can fix it all up. Most of the time this
790 * will all go to plan.
791 */
792 if (vrefcnt(vp) > 1) {
793 VOP_UNLOCK(vp);
794 if (vtryrele(vp)) {
795 return;
796 }
797 lktype = LK_NONE;
798 } else {
799 lktype = VOP_ISLOCKED(vp);
800 KASSERT(lktype != LK_NONE);
801 }
802 mutex_enter(vp->v_interlock);
803 vrelel(vp, 0, lktype);
804 }
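
/*
 * Illustrative only: a hedged sketch of vput() as the shorthand for
 * dropping the vnode lock and the reference together ("cred" is assumed
 * to be a valid kauth_cred_t):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_FSYNC(vp, cred, FSYNC_WAIT, 0, 0);
 *	vput(vp);
 *
 * is, apart from the fast path above, equivalent to ending with:
 *
 *	VOP_UNLOCK(vp);
 *	vrele(vp);
 */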
805
806 /*
807 * Release a vnode from the deferred list.
808 */
809 static void
810 vrele_deferred(vnode_impl_t *vip)
811 {
812 vnode_t *vp;
813
814 KASSERT(mutex_owned(&vdrain_lock));
815 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
816
817 vp = VIMPL_TO_VNODE(vip);
818
819 /*
820 * First remove the vnode from the vrele list.
821 * Put it on the last lru list, the last vrele()
822 * will put it back onto the right list before
823 * its usecount reaches zero.
824 */
825 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
826 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
827 vip->vi_lrulisttm = getticks();
828 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
829
830 mutex_exit(&vdrain_lock);
831
832 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
833 mutex_enter(vp->v_interlock);
834 vrelel(vp, 0, LK_EXCLUSIVE);
835
836 mutex_enter(&vdrain_lock);
837 }
838
839 /*
840 * Vnode release. If the reference count drops to zero, call the inactive
841 * routine and either return the vnode to the freelist or free it to the pool.
842 */
843 static void
844 vrelel(vnode_t *vp, int flags, int lktype)
845 {
846 const bool async = ((flags & VRELEL_ASYNC) != 0);
847 bool recycle, defer, objlock_held;
848 u_int use, next;
849 int error;
850
851 objlock_held = false;
852
853 retry:
854 KASSERT(mutex_owned(vp->v_interlock));
855
856 if (__predict_false(vp->v_op == dead_vnodeop_p &&
857 VSTATE_GET(vp) != VS_RECLAIMED)) {
858 vnpanic(vp, "dead but not clean");
859 }
860
861 /*
862 * If not the last reference, just unlock and drop the reference count.
863 *
864 * Otherwise make sure we pass a point in time where we hold the
865 * last reference with VGET flag unset.
866 */
867 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
868 if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
869 if (objlock_held) {
870 objlock_held = false;
871 rw_exit(vp->v_uobj.vmobjlock);
872 }
873 if (lktype != LK_NONE) {
874 mutex_exit(vp->v_interlock);
875 lktype = LK_NONE;
876 VOP_UNLOCK(vp);
877 mutex_enter(vp->v_interlock);
878 }
879 if (vtryrele(vp)) {
880 mutex_exit(vp->v_interlock);
881 return;
882 }
883 next = atomic_load_relaxed(&vp->v_usecount);
884 continue;
885 }
886 KASSERT((use & VUSECOUNT_MASK) == 1);
887 next = use & ~VUSECOUNT_VGET;
888 if (next != use) {
889 next = atomic_cas_uint(&vp->v_usecount, use, next);
890 }
891 if (__predict_true(next == use)) {
892 break;
893 }
894 }
895 membar_acquire();
896 if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
897 vnpanic(vp, "%s: bad ref count", __func__);
898 }
899
900 #ifdef DIAGNOSTIC
901 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
902 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
903 vprint("vrelel: missing VOP_CLOSE()", vp);
904 }
905 #endif
906
907 /*
908 * If already clean there is no need to lock, defer or
909 * deactivate this node.
910 */
911 if (VSTATE_GET(vp) == VS_RECLAIMED) {
912 if (objlock_held) {
913 objlock_held = false;
914 rw_exit(vp->v_uobj.vmobjlock);
915 }
916 if (lktype != LK_NONE) {
917 mutex_exit(vp->v_interlock);
918 lktype = LK_NONE;
919 VOP_UNLOCK(vp);
920 mutex_enter(vp->v_interlock);
921 }
922 goto out;
923 }
924
925 /*
926 * First try to get the vnode locked for VOP_INACTIVE().
927 * Defer the vnode release to the vrele task if the caller requests it
928 * explicitly, if the caller is the pagedaemon, or if taking the lock failed.
929 */
930 defer = false;
931 if ((curlwp == uvm.pagedaemon_lwp) || async) {
932 defer = true;
933 } else if (lktype == LK_SHARED) {
934 /* Excellent chance of getting the lock if this is the last ref. */
935 error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
936 if (error != 0) {
937 defer = true;
938 } else {
939 lktype = LK_EXCLUSIVE;
940 }
941 } else if (lktype == LK_NONE) {
942 /* Excellent chance of getting the lock if this is the last ref. */
943 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
944 if (error != 0) {
945 defer = true;
946 } else {
947 lktype = LK_EXCLUSIVE;
948 }
949 }
950 KASSERT(mutex_owned(vp->v_interlock));
951 if (defer) {
952 /*
953 * Defer reclaim to the vrele task; it's not safe to
954 * clean it here. We donate it our last reference.
955 */
956 if (lktype != LK_NONE) {
957 mutex_exit(vp->v_interlock);
958 VOP_UNLOCK(vp);
959 mutex_enter(vp->v_interlock);
960 }
961 lru_requeue(vp, &lru_list[LRU_VRELE]);
962 mutex_exit(vp->v_interlock);
963 return;
964 }
965 KASSERT(lktype == LK_EXCLUSIVE);
966
967 /* If the node gained another reference, retry. */
968 use = atomic_load_relaxed(&vp->v_usecount);
969 if ((use & VUSECOUNT_VGET) != 0) {
970 goto retry;
971 }
972 KASSERT((use & VUSECOUNT_MASK) == 1);
973
974 if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
975 (vp->v_vflag & VV_MAPPED) != 0) {
976 /* Take care of space accounting. */
977 if (!objlock_held) {
978 objlock_held = true;
979 if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
980 mutex_exit(vp->v_interlock);
981 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
982 mutex_enter(vp->v_interlock);
983 goto retry;
984 }
985 }
986 if ((vp->v_iflag & VI_EXECMAP) != 0) {
987 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
988 }
989 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
990 vp->v_vflag &= ~VV_MAPPED;
991 }
992 if (objlock_held) {
993 objlock_held = false;
994 rw_exit(vp->v_uobj.vmobjlock);
995 }
996
997 /*
998 * Deactivate the vnode, but preserve our reference across
999 * the call to VOP_INACTIVE().
1000 *
1001 * If VOP_INACTIVE() indicates that the file has been
1002 * deleted, then recycle the vnode.
1003 *
1004 * Note that VOP_INACTIVE() will not drop the vnode lock.
1005 */
1006 mutex_exit(vp->v_interlock);
1007 recycle = false;
1008 VOP_INACTIVE(vp, &recycle);
1009 if (!recycle) {
1010 lktype = LK_NONE;
1011 VOP_UNLOCK(vp);
1012 }
1013 mutex_enter(vp->v_interlock);
1014
1015 /*
1016 * Block new references then check again to see if a
1017 * new reference was acquired in the meantime. If
1018 * it was, restore the vnode state and try again.
1019 */
1020 if (recycle) {
1021 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1022 use = atomic_load_relaxed(&vp->v_usecount);
1023 if ((use & VUSECOUNT_VGET) != 0) {
1024 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1025 goto retry;
1026 }
1027 KASSERT((use & VUSECOUNT_MASK) == 1);
1028 }
1029
1030 /*
1031 * Recycle the vnode if the file is now unused (unlinked).
1032 */
1033 if (recycle) {
1034 VSTATE_ASSERT(vp, VS_BLOCKED);
1035 KASSERT(lktype == LK_EXCLUSIVE);
1036 /* vcache_reclaim drops the lock. */
1037 lktype = LK_NONE;
1038 vcache_reclaim(vp);
1039 }
1040 KASSERT(vrefcnt(vp) > 0);
1041 KASSERT(lktype == LK_NONE);
1042
1043 out:
1044 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1045 if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
1046 (use & VUSECOUNT_MASK) == 1)) {
1047 /* Gained and released another reference, retry. */
1048 goto retry;
1049 }
1050 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1051 if (__predict_true(next == use)) {
1052 if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
1053 /* Gained another reference. */
1054 mutex_exit(vp->v_interlock);
1055 return;
1056 }
1057 break;
1058 }
1059 }
1060 membar_acquire();
1061
1062 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
1063 /*
1064 * It's clean so destroy it. It isn't referenced
1065 * anywhere since it has been reclaimed.
1066 */
1067 vcache_free(VNODE_TO_VIMPL(vp));
1068 } else {
1069 /*
1070 * Otherwise, put it back onto the freelist. It
1071 * can't be destroyed while still associated with
1072 * a file system.
1073 */
1074 lru_requeue(vp, lru_which(vp));
1075 mutex_exit(vp->v_interlock);
1076 }
1077 }
1078
1079 void
1080 vrele(vnode_t *vp)
1081 {
1082
1083 if (vtryrele(vp)) {
1084 return;
1085 }
1086 mutex_enter(vp->v_interlock);
1087 vrelel(vp, 0, LK_NONE);
1088 }
1089
1090 /*
1091 * Asynchronous vnode release: the vnode is released in a different context.
1092 */
1093 void
1094 vrele_async(vnode_t *vp)
1095 {
1096
1097 if (vtryrele(vp)) {
1098 return;
1099 }
1100 mutex_enter(vp->v_interlock);
1101 vrelel(vp, VRELEL_ASYNC, LK_NONE);
1102 }
1103
1104 /*
1105 * Vnode reference, where a reference is already held by some other
1106 * object (for example, a file structure).
1107 *
1108 * NB: lockless code sequences may rely on this not blocking.
1109 */
1110 void
1111 vref(vnode_t *vp)
1112 {
1113
1114 KASSERT(vrefcnt(vp) > 0);
1115
1116 atomic_inc_uint(&vp->v_usecount);
1117 }
1118
1119 /*
1120 * Page or buffer structure gets a reference.
1121 * Called with v_interlock held.
1122 */
1123 void
1124 vholdl(vnode_t *vp)
1125 {
1126
1127 KASSERT(mutex_owned(vp->v_interlock));
1128
1129 if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
1130 lru_requeue(vp, lru_which(vp));
1131 }
1132
1133 /*
1134 * Page or buffer structure gets a reference.
1135 */
1136 void
1137 vhold(vnode_t *vp)
1138 {
1139
1140 mutex_enter(vp->v_interlock);
1141 vholdl(vp);
1142 mutex_exit(vp->v_interlock);
1143 }
1144
1145 /*
1146 * Page or buffer structure frees a reference.
1147 * Called with v_interlock held.
1148 */
1149 void
1150 holdrelel(vnode_t *vp)
1151 {
1152
1153 KASSERT(mutex_owned(vp->v_interlock));
1154
1155 if (vp->v_holdcnt <= 0) {
1156 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
1157 }
1158
1159 vp->v_holdcnt--;
1160 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1161 lru_requeue(vp, lru_which(vp));
1162 }
1163
1164 /*
1165 * Page or buffer structure frees a reference.
1166 */
1167 void
1168 holdrele(vnode_t *vp)
1169 {
1170
1171 mutex_enter(vp->v_interlock);
1172 holdrelel(vp);
1173 mutex_exit(vp->v_interlock);
1174 }
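
/*
 * Illustrative only: a hedged sketch of the hold count as used when
 * buffers or pages cache data belonging to the vnode:
 *
 *	vhold(vp);		// e.g. a buffer was attached to vp
 *	...data stays associated with the vnode...
 *	holdrele(vp);		// buffer gone, drop the hold
 *
 * A hold keeps the vnode on the "hold" LRU list (see lru_which()) and
 * prevents the vnode_impl from being freed, but unlike v_usecount it
 * does not prevent the vnode from being reclaimed.
 */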
1175
1176 /*
1177 * Recycle an unused vnode if the caller holds the last reference.
1178 */
1179 bool
1180 vrecycle(vnode_t *vp)
1181 {
1182 int error __diagused;
1183
1184 mutex_enter(vp->v_interlock);
1185
1186 /* If the vnode is already clean we're done. */
1187 VSTATE_WAIT_STABLE(vp);
1188 if (VSTATE_GET(vp) != VS_LOADED) {
1189 VSTATE_ASSERT(vp, VS_RECLAIMED);
1190 vrelel(vp, 0, LK_NONE);
1191 return true;
1192 }
1193
1194 /* Prevent further references until the vnode is locked. */
1195 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1196
1197 /* Make sure we hold the last reference. */
1198 if (vrefcnt(vp) != 1) {
1199 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1200 mutex_exit(vp->v_interlock);
1201 return false;
1202 }
1203
1204 mutex_exit(vp->v_interlock);
1205
1206 /*
1207 * On a leaf file system this lock will always succeed as we hold
1208 * the last reference and prevent further references.
1209 * On layered file systems waiting for the lock would open a can of
1210 * deadlocks as the lower vnodes may have other active references.
1211 */
1212 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
1213
1214 mutex_enter(vp->v_interlock);
1215 if (error) {
1216 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1217 mutex_exit(vp->v_interlock);
1218 return false;
1219 }
1220
1221 KASSERT(vrefcnt(vp) == 1);
1222 vcache_reclaim(vp);
1223 vrelel(vp, 0, LK_NONE);
1224
1225 return true;
1226 }
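
/*
 * Illustrative only: a hedged sketch of a caller that holds the last
 * reference and wants the vnode cleaned out right away rather than
 * left on a free list:
 *
 *	if (!vrecycle(vp))
 *		vrele(vp);	// someone else still had a reference,
 *				// or the lock could not be taken
 *
 * vrecycle() consumes the reference whenever it returns true (including
 * the "already clean" case) and leaves it to the caller when it
 * returns false.
 */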
1227
1228 /*
1229 * Helper for vrevoke() to propagate suspension from lastmp
1230 * to thismp. Both args may be NULL.
1231 * Returns the currently suspended file system or NULL.
1232 */
1233 static struct mount *
1234 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
1235 {
1236 int error;
1237
1238 if (lastmp == thismp)
1239 return thismp;
1240
1241 if (lastmp != NULL)
1242 vfs_resume(lastmp);
1243
1244 if (thismp == NULL)
1245 return NULL;
1246
1247 do {
1248 error = vfs_suspend(thismp, 0);
1249 } while (error == EINTR || error == ERESTART);
1250
1251 if (error == 0)
1252 return thismp;
1253
1254 KASSERT(error == EOPNOTSUPP || error == ENOENT);
1255 return NULL;
1256 }
1257
1258 /*
1259 * Eliminate all activity associated with the requested vnode
1260 * and with all vnodes aliased to the requested vnode.
1261 */
1262 void
1263 vrevoke(vnode_t *vp)
1264 {
1265 struct mount *mp;
1266 vnode_t *vq;
1267 enum vtype type;
1268 dev_t dev;
1269
1270 KASSERT(vrefcnt(vp) > 0);
1271
1272 mp = vrevoke_suspend_next(NULL, vp->v_mount);
1273
1274 mutex_enter(vp->v_interlock);
1275 VSTATE_WAIT_STABLE(vp);
1276 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1277 mutex_exit(vp->v_interlock);
1278 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1279 atomic_inc_uint(&vp->v_usecount);
1280 mutex_exit(vp->v_interlock);
1281 vgone(vp);
1282 } else {
1283 dev = vp->v_rdev;
1284 type = vp->v_type;
1285 mutex_exit(vp->v_interlock);
1286
1287 while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
1288 == 0) {
1289 mp = vrevoke_suspend_next(mp, vq->v_mount);
1290 vgone(vq);
1291 }
1292 }
1293 vrevoke_suspend_next(mp, NULL);
1294 }
1295
1296 /*
1297 * Eliminate all activity associated with a vnode in preparation for
1298 * reuse. Drops a reference from the vnode.
1299 */
1300 void
1301 vgone(vnode_t *vp)
1302 {
1303 int lktype;
1304
1305 KASSERT(vp->v_mount == dead_rootmount ||
1306 fstrans_is_owner(vp->v_mount));
1307
1308 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1309 lktype = LK_EXCLUSIVE;
1310 mutex_enter(vp->v_interlock);
1311 VSTATE_WAIT_STABLE(vp);
1312 if (VSTATE_GET(vp) == VS_LOADED) {
1313 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1314 vcache_reclaim(vp);
1315 lktype = LK_NONE;
1316 }
1317 VSTATE_ASSERT(vp, VS_RECLAIMED);
1318 vrelel(vp, 0, lktype);
1319 }
1320
1321 static inline uint32_t
1322 vcache_hash(const struct vcache_key *key)
1323 {
1324 uint32_t hash = HASH32_BUF_INIT;
1325
1326 KASSERT(key->vk_key_len > 0);
1327
1328 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1329 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1330 return hash;
1331 }
1332
1333 static int
1334 vcache_stats(struct hashstat_sysctl *hs, bool fill)
1335 {
1336 vnode_impl_t *vip;
1337 uint64_t chain;
1338
1339 strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
1340 strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
1341 if (!fill)
1342 return 0;
1343
1344 hs->hash_size = vcache_hashmask + 1;
1345
1346 for (size_t i = 0; i < hs->hash_size; i++) {
1347 chain = 0;
1348 mutex_enter(&vcache_lock);
1349 SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
1350 chain++;
1351 }
1352 mutex_exit(&vcache_lock);
1353 if (chain > 0) {
1354 hs->hash_used++;
1355 hs->hash_items += chain;
1356 if (chain > hs->hash_maxchain)
1357 hs->hash_maxchain = chain;
1358 }
1359 preempt_point();
1360 }
1361
1362 return 0;
1363 }
1364
1365 static void
1366 vcache_init(void)
1367 {
1368
1369 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
1370 0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1371 KASSERT(vcache_pool != NULL);
1372 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1373 cv_init(&vcache_cv, "vcache");
1374 vcache_hashsize = desiredvnodes;
1375 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1376 &vcache_hashmask);
1377 hashstat_register("vcache", vcache_stats);
1378 }
1379
1380 static void
1381 vcache_reinit(void)
1382 {
1383 int i;
1384 uint32_t hash;
1385 u_long oldmask, newmask;
1386 struct hashhead *oldtab, *newtab;
1387 vnode_impl_t *vip;
1388
1389 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1390 mutex_enter(&vcache_lock);
1391 oldtab = vcache_hashtab;
1392 oldmask = vcache_hashmask;
1393 vcache_hashsize = desiredvnodes;
1394 vcache_hashtab = newtab;
1395 vcache_hashmask = newmask;
1396 for (i = 0; i <= oldmask; i++) {
1397 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1398 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1399 hash = vcache_hash(&vip->vi_key);
1400 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1401 vip, vi_hash);
1402 }
1403 }
1404 mutex_exit(&vcache_lock);
1405 hashdone(oldtab, HASH_SLIST, oldmask);
1406 }
1407
1408 static inline vnode_impl_t *
1409 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1410 {
1411 struct hashhead *hashp;
1412 vnode_impl_t *vip;
1413
1414 KASSERT(mutex_owned(&vcache_lock));
1415
1416 hashp = &vcache_hashtab[hash & vcache_hashmask];
1417 SLIST_FOREACH(vip, hashp, vi_hash) {
1418 if (key->vk_mount != vip->vi_key.vk_mount)
1419 continue;
1420 if (key->vk_key_len != vip->vi_key.vk_key_len)
1421 continue;
1422 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1423 continue;
1424 return vip;
1425 }
1426 return NULL;
1427 }
1428
1429 /*
1430 * Allocate a new, uninitialized vcache node.
1431 */
1432 static vnode_impl_t *
1433 vcache_alloc(void)
1434 {
1435 vnode_impl_t *vip;
1436 vnode_t *vp;
1437
1438 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1439 vp = VIMPL_TO_VNODE(vip);
1440 memset(vip, 0, sizeof(*vip));
1441
1442 rw_init(&vip->vi_lock);
1443 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
1444
1445 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
1446 klist_init(&vip->vi_klist.vk_klist);
1447 vp->v_klist = &vip->vi_klist;
1448 cv_init(&vp->v_cv, "vnode");
1449 cache_vnode_init(vp);
1450
1451 vp->v_usecount = 1;
1452 vp->v_type = VNON;
1453 vp->v_size = vp->v_writesize = VSIZENOTSET;
1454
1455 vip->vi_state = VS_LOADING;
1456
1457 lru_requeue(vp, &lru_list[LRU_FREE]);
1458
1459 return vip;
1460 }
1461
1462 /*
1463 * Deallocate a vcache node in state VS_LOADING.
1464 *
1465 * vcache_lock held on entry and released on return.
1466 */
1467 static void
1468 vcache_dealloc(vnode_impl_t *vip)
1469 {
1470 vnode_t *vp;
1471
1472 KASSERT(mutex_owned(&vcache_lock));
1473
1474 vp = VIMPL_TO_VNODE(vip);
1475 vfs_ref(dead_rootmount);
1476 vfs_insmntque(vp, dead_rootmount);
1477 mutex_enter(vp->v_interlock);
1478 vp->v_op = dead_vnodeop_p;
1479 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1480 mutex_exit(&vcache_lock);
1481 vrelel(vp, 0, LK_NONE);
1482 }
1483
1484 /*
1485 * Free an unused, unreferenced vcache node.
1486 * v_interlock locked on entry.
1487 */
1488 static void
1489 vcache_free(vnode_impl_t *vip)
1490 {
1491 vnode_t *vp;
1492
1493 vp = VIMPL_TO_VNODE(vip);
1494 KASSERT(mutex_owned(vp->v_interlock));
1495
1496 KASSERT(vrefcnt(vp) == 0);
1497 KASSERT(vp->v_holdcnt == 0);
1498 KASSERT(vp->v_writecount == 0);
1499 lru_requeue(vp, NULL);
1500 mutex_exit(vp->v_interlock);
1501
1502 vfs_insmntque(vp, NULL);
1503 if (vp->v_type == VBLK || vp->v_type == VCHR)
1504 spec_node_destroy(vp);
1505
1506 mutex_obj_free(vp->v_interlock);
1507 rw_destroy(&vip->vi_lock);
1508 uvm_obj_destroy(&vp->v_uobj, true);
1509 KASSERT(vp->v_klist == &vip->vi_klist);
1510 klist_fini(&vip->vi_klist.vk_klist);
1511 cv_destroy(&vp->v_cv);
1512 cache_vnode_fini(vp);
1513 pool_cache_put(vcache_pool, vip);
1514 }
1515
1516 /*
1517 * Try to get an initial reference on this cached vnode.
1518 * Returns zero on success or EBUSY if the vnode state is not LOADED.
1519 *
1520 * NB: lockless code sequences may rely on this not blocking.
1521 */
1522 int
1523 vcache_tryvget(vnode_t *vp)
1524 {
1525 u_int use, next;
1526
1527 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1528 if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
1529 return SET_ERROR(EBUSY);
1530 }
1531 next = atomic_cas_uint(&vp->v_usecount,
1532 use, (use + 1) | VUSECOUNT_VGET);
1533 if (__predict_true(next == use)) {
1534 membar_acquire();
1535 return 0;
1536 }
1537 }
1538 }
1539
1540 /*
1541 * Try to get an initial reference on this cached vnode.
1542 * Returns zero on success, or ENOENT if the vnode has been reclaimed.
1543 * Will wait for the vnode state to be stable.
1544 *
1545 * v_interlock locked on entry and unlocked on exit.
1546 */
1547 int
1548 vcache_vget(vnode_t *vp)
1549 {
1550 int error;
1551
1552 KASSERT(mutex_owned(vp->v_interlock));
1553
1554 /* Increment hold count to prevent vnode from disappearing. */
1555 vp->v_holdcnt++;
1556 VSTATE_WAIT_STABLE(vp);
1557 vp->v_holdcnt--;
1558
1559 /* If this was the last reference to a reclaimed vnode free it now. */
1560 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1561 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1562 vcache_free(VNODE_TO_VIMPL(vp));
1563 else
1564 mutex_exit(vp->v_interlock);
1565 return SET_ERROR(ENOENT);
1566 }
1567 VSTATE_ASSERT(vp, VS_LOADED);
1568 error = vcache_tryvget(vp);
1569 KASSERT(error == 0);
1570 mutex_exit(vp->v_interlock);
1571
1572 return 0;
1573 }
1574
1575 /*
1576 * Get a vnode / fs node pair by key and return it referenced through vpp.
1577 */
1578 int
1579 vcache_get(struct mount *mp, const void *key, size_t key_len,
1580 struct vnode **vpp)
1581 {
1582 int error;
1583 uint32_t hash;
1584 const void *new_key;
1585 struct vnode *vp;
1586 struct vcache_key vcache_key;
1587 vnode_impl_t *vip, *new_vip;
1588
1589 new_key = NULL;
1590 *vpp = NULL;
1591
1592 vcache_key.vk_mount = mp;
1593 vcache_key.vk_key = key;
1594 vcache_key.vk_key_len = key_len;
1595 hash = vcache_hash(&vcache_key);
1596
1597 again:
1598 mutex_enter(&vcache_lock);
1599 vip = vcache_hash_lookup(&vcache_key, hash);
1600
1601 /* If found, take a reference or retry. */
1602 if (__predict_true(vip != NULL)) {
1603 /*
1604 * If the vnode is loading we cannot take the v_interlock
1605 * here as it might change during load (see uvm_obj_setlock()).
1606 * As changing state from VS_LOADING requires both vcache_lock
1607 * and v_interlock it is safe to test with vcache_lock held.
1608 *
1609 * Wait for vnodes changing state from VS_LOADING and retry.
1610 */
1611 if (__predict_false(vip->vi_state == VS_LOADING)) {
1612 cv_wait(&vcache_cv, &vcache_lock);
1613 mutex_exit(&vcache_lock);
1614 goto again;
1615 }
1616 vp = VIMPL_TO_VNODE(vip);
1617 mutex_enter(vp->v_interlock);
1618 mutex_exit(&vcache_lock);
1619 error = vcache_vget(vp);
1620 if (error == ENOENT)
1621 goto again;
1622 if (error == 0)
1623 *vpp = vp;
1624 KASSERT((error != 0) == (*vpp == NULL));
1625 return error;
1626 }
1627 mutex_exit(&vcache_lock);
1628
1629 /* Allocate and initialize a new vcache / vnode pair. */
1630 error = vfs_busy(mp);
1631 if (error)
1632 return error;
1633 new_vip = vcache_alloc();
1634 new_vip->vi_key = vcache_key;
1635 vp = VIMPL_TO_VNODE(new_vip);
1636 mutex_enter(&vcache_lock);
1637 vip = vcache_hash_lookup(&vcache_key, hash);
1638 if (vip == NULL) {
1639 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1640 new_vip, vi_hash);
1641 vip = new_vip;
1642 }
1643
1644 /* If another thread beat us inserting this node, retry. */
1645 if (vip != new_vip) {
1646 vcache_dealloc(new_vip);
1647 vfs_unbusy(mp);
1648 goto again;
1649 }
1650 mutex_exit(&vcache_lock);
1651
1652 /* Load the fs node. Exclusive as new_node is VS_LOADING. */
1653 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1654 if (error) {
1655 mutex_enter(&vcache_lock);
1656 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1657 new_vip, vnode_impl, vi_hash);
1658 vcache_dealloc(new_vip);
1659 vfs_unbusy(mp);
1660 KASSERT(*vpp == NULL);
1661 return error;
1662 }
1663 KASSERT(new_key != NULL);
1664 KASSERT(memcmp(key, new_key, key_len) == 0);
1665 KASSERT(vp->v_op != NULL);
1666 vfs_insmntque(vp, mp);
1667 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1668 vp->v_vflag |= VV_MPSAFE;
1669 vfs_ref(mp);
1670 vfs_unbusy(mp);
1671
1672 /* Finished loading, finalize node. */
1673 mutex_enter(&vcache_lock);
1674 new_vip->vi_key.vk_key = new_key;
1675 mutex_enter(vp->v_interlock);
1676 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1677 mutex_exit(vp->v_interlock);
1678 mutex_exit(&vcache_lock);
1679 *vpp = vp;
1680 return 0;
1681 }
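
/*
 * Illustrative only: a hedged sketch of how a file system's vget/lookup
 * path typically uses vcache_get(), with an inode number as the key
 * (the "myfs" names are hypothetical):
 *
 *	int
 *	myfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 *	{
 *		int error;
 *
 *		error = vcache_get(mp, &ino, sizeof(ino), vpp);
 *		if (error)
 *			return error;
 *		// *vpp is referenced and LOADED but not locked;
 *		// the per-inode setup happened in VFS_LOADVNODE().
 *		return 0;
 *	}
 */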
1682
1683 /*
1684 * Create a new vnode / fs node pair and return it referenced through vpp.
1685 */
1686 int
1687 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1688 kauth_cred_t cred, void *extra, struct vnode **vpp)
1689 {
1690 int error;
1691 uint32_t hash;
1692 struct vnode *vp, *ovp;
1693 vnode_impl_t *vip, *ovip;
1694
1695 *vpp = NULL;
1696
1697 /* Allocate and initialize a new vcache / vnode pair. */
1698 error = vfs_busy(mp);
1699 if (error)
1700 return error;
1701 vip = vcache_alloc();
1702 vip->vi_key.vk_mount = mp;
1703 vp = VIMPL_TO_VNODE(vip);
1704
1705 /* Create and load the fs node. */
1706 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1707 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1708 if (error) {
1709 mutex_enter(&vcache_lock);
1710 vcache_dealloc(vip);
1711 vfs_unbusy(mp);
1712 KASSERT(*vpp == NULL);
1713 return error;
1714 }
1715 KASSERT(vp->v_op != NULL);
1716 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1717 if (vip->vi_key.vk_key_len > 0) {
1718 KASSERT(vip->vi_key.vk_key != NULL);
1719 hash = vcache_hash(&vip->vi_key);
1720
1721 /*
1722 * Wait for previous instance to be reclaimed,
1723 * then insert new node.
1724 */
1725 mutex_enter(&vcache_lock);
1726 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1727 ovp = VIMPL_TO_VNODE(ovip);
1728 mutex_enter(ovp->v_interlock);
1729 mutex_exit(&vcache_lock);
1730 error = vcache_vget(ovp);
1731 KASSERT(error == ENOENT);
1732 mutex_enter(&vcache_lock);
1733 }
1734 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1735 vip, vi_hash);
1736 mutex_exit(&vcache_lock);
1737 }
1738 vfs_insmntque(vp, mp);
1739 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1740 vp->v_vflag |= VV_MPSAFE;
1741 vfs_ref(mp);
1742 vfs_unbusy(mp);
1743
1744 /* Finished loading, finalize node. */
1745 mutex_enter(&vcache_lock);
1746 mutex_enter(vp->v_interlock);
1747 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1748 mutex_exit(&vcache_lock);
1749 mutex_exit(vp->v_interlock);
1750 *vpp = vp;
1751 return 0;
1752 }
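
/*
 * Illustrative only: a hedged sketch of a create-style operation using
 * vcache_new(); "dvp", "vap" and "cnp" are the usual directory vnode,
 * attributes and componentname arguments of the calling VOP:
 *
 *	error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, &vp);
 *	if (error)
 *		return error;
 *	error = vn_lock(vp, LK_EXCLUSIVE);
 *	if (error) {
 *		vrele(vp);
 *		return error;
 *	}
 *	*vpp = vp;
 *	return 0;
 */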
1753
1754 /*
1755 * Prepare key change: update the old cache node's key and lock the new cache node.
1756 * Return an error if the new node already exists.
1757 */
1758 int
1759 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1760 const void *old_key, size_t old_key_len,
1761 const void *new_key, size_t new_key_len)
1762 {
1763 uint32_t old_hash, new_hash;
1764 struct vcache_key old_vcache_key, new_vcache_key;
1765 vnode_impl_t *vip, *new_vip;
1766
1767 old_vcache_key.vk_mount = mp;
1768 old_vcache_key.vk_key = old_key;
1769 old_vcache_key.vk_key_len = old_key_len;
1770 old_hash = vcache_hash(&old_vcache_key);
1771
1772 new_vcache_key.vk_mount = mp;
1773 new_vcache_key.vk_key = new_key;
1774 new_vcache_key.vk_key_len = new_key_len;
1775 new_hash = vcache_hash(&new_vcache_key);
1776
1777 new_vip = vcache_alloc();
1778 new_vip->vi_key = new_vcache_key;
1779
1780 /* Insert locked new node used as placeholder. */
1781 mutex_enter(&vcache_lock);
1782 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1783 if (vip != NULL) {
1784 vcache_dealloc(new_vip);
1785 return SET_ERROR(EEXIST);
1786 }
1787 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1788 new_vip, vi_hash);
1789
1790 /* Replace old nodes key with the temporary copy. */
1791 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1792 KASSERT(vip != NULL);
1793 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1794 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1795 vip->vi_key = old_vcache_key;
1796 mutex_exit(&vcache_lock);
1797 return 0;
1798 }
1799
1800 /*
1801 * Key change complete: update the old node and remove the placeholder.
1802 */
1803 void
1804 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1805 const void *old_key, size_t old_key_len,
1806 const void *new_key, size_t new_key_len)
1807 {
1808 uint32_t old_hash, new_hash;
1809 struct vcache_key old_vcache_key, new_vcache_key;
1810 vnode_impl_t *vip, *new_vip;
1811 struct vnode *new_vp;
1812
1813 old_vcache_key.vk_mount = mp;
1814 old_vcache_key.vk_key = old_key;
1815 old_vcache_key.vk_key_len = old_key_len;
1816 old_hash = vcache_hash(&old_vcache_key);
1817
1818 new_vcache_key.vk_mount = mp;
1819 new_vcache_key.vk_key = new_key;
1820 new_vcache_key.vk_key_len = new_key_len;
1821 new_hash = vcache_hash(&new_vcache_key);
1822
1823 mutex_enter(&vcache_lock);
1824
1825 /* Lookup old and new node. */
1826 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1827 KASSERT(vip != NULL);
1828 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1829
1830 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1831 KASSERT(new_vip != NULL);
1832 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1833 new_vp = VIMPL_TO_VNODE(new_vip);
1834 mutex_enter(new_vp->v_interlock);
1835 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1836 mutex_exit(new_vp->v_interlock);
1837
1838 /* Rekey old node and put it onto its new hashlist. */
1839 vip->vi_key = new_vcache_key;
1840 if (old_hash != new_hash) {
1841 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1842 vip, vnode_impl, vi_hash);
1843 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1844 vip, vi_hash);
1845 }
1846
1847 /* Remove new node used as placeholder. */
1848 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1849 new_vip, vnode_impl, vi_hash);
1850 vcache_dealloc(new_vip);
1851 }
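
/*
 * Illustrative only: the two functions above form a small protocol for
 * a file system whose cache key can change (e.g. a key derived from the
 * directory slot).  A hedged sketch:
 *
 *	error = vcache_rekey_enter(mp, vp, &old_key, sizeof(old_key),
 *	    &new_key, sizeof(new_key));
 *	if (error)
 *		return error;		// EEXIST: target key already cached
 *	...update the file system's own copy of the key...
 *	vcache_rekey_exit(mp, vp, &old_key, sizeof(old_key),
 *	    &new_key, sizeof(new_key));
 *
 * The key buffers passed in must stay valid, as the cache stores the
 * pointers rather than copying the keys.
 */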
1852
1853 /*
1854 * Disassociate the underlying file system from a vnode.
1855 *
1856 * Must be called with vnode locked and will return unlocked.
1857 * Must be called with the interlock held, and will return with it held.
1858 */
1859 static void
1860 vcache_reclaim(vnode_t *vp)
1861 {
1862 lwp_t *l = curlwp;
1863 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1864 struct mount *mp = vp->v_mount;
1865 uint32_t hash;
1866 uint8_t temp_buf[64], *temp_key;
1867 size_t temp_key_len;
1868 bool recycle;
1869 int error;
1870
1871 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1872 KASSERT(mutex_owned(vp->v_interlock));
1873 KASSERT(vrefcnt(vp) != 0);
1874
1875 temp_key_len = vip->vi_key.vk_key_len;
1876 /*
1877 * Prevent the vnode from being recycled or brought into use
1878 * while we clean it out.
1879 */
1880 VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
1881
1882 /*
1883 * Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
1884 * because VOP_RECLAIM() could cause vp->v_klist to
1885 * become invalid. Don't check for interest in NOTE_REVOKE
1886 * here; it's always posted because it sets EV_EOF.
1887 *
1888 * Once it's been posted, reset vp->v_klist to point to
1889 * our own local storage, in case we were sharing with
1890 * someone else.
1891 */
1892 KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
1893 vp->v_klist = &vip->vi_klist;
1894 mutex_exit(vp->v_interlock);
1895
1896 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1897 mutex_enter(vp->v_interlock);
1898 if ((vp->v_iflag & VI_EXECMAP) != 0) {
1899 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
1900 }
1901 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1902 vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
1903 mutex_exit(vp->v_interlock);
1904 rw_exit(vp->v_uobj.vmobjlock);
1905
1906 /*
1907 * With vnode state set to reclaiming, purge name cache immediately
1908 * to prevent new handles on vnode, and wait for existing threads
1909 * trying to get a handle to notice VS_RECLAIMED status and abort.
1910 */
1911 cache_purge(vp);
1912
1913 /* Replace the vnode key with a temporary copy. */
1914 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1915 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1916 } else {
1917 temp_key = temp_buf;
1918 }
1919 if (vip->vi_key.vk_key_len > 0) {
1920 mutex_enter(&vcache_lock);
1921 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1922 vip->vi_key.vk_key = temp_key;
1923 mutex_exit(&vcache_lock);
1924 }
1925
1926 fstrans_start(mp);
1927
1928 /*
1929 * Clean out any cached data associated with the vnode.
1930 */
1931 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1932 if (error != 0) {
1933 if (wapbl_vphaswapbl(vp))
1934 WAPBL_DISCARD(wapbl_vptomp(vp));
1935 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1936 }
1937 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1938 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1939 if (vp->v_type == VBLK || vp->v_type == VCHR) {
1940 spec_node_revoke(vp);
1941 }
1942
1943 /*
1944 * Disassociate the underlying file system from the vnode.
1945 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1946 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1947 * would no longer function.
1948 */
1949 VOP_INACTIVE(vp, &recycle);
1950 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1951 if (VOP_RECLAIM(vp)) {
1952 vnpanic(vp, "%s: cannot reclaim", __func__);
1953 }
1954
1955 KASSERT(vp->v_data == NULL);
1956 KASSERT((vp->v_iflag & VI_PAGES) == 0);
1957
1958 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1959 uvm_ra_freectx(vp->v_ractx);
1960 vp->v_ractx = NULL;
1961 }
1962
1963 if (vip->vi_key.vk_key_len > 0) {
1964 /* Remove from vnode cache. */
1965 hash = vcache_hash(&vip->vi_key);
1966 mutex_enter(&vcache_lock);
1967 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1968 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1969 vip, vnode_impl, vi_hash);
1970 mutex_exit(&vcache_lock);
1971 }
1972 if (temp_key != temp_buf)
1973 kmem_free(temp_key, temp_key_len);
1974
1975 /* Done with purge, notify sleepers of the grim news. */
1976 mutex_enter(vp->v_interlock);
1977 vp->v_op = dead_vnodeop_p;
1978 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1979 vp->v_tag = VT_NON;
1980 mutex_exit(vp->v_interlock);
1981
1982 /*
1983 * Move to dead mount. Must be after changing the operations
1984 * vector as vnode operations enter the mount before using the
1985 * operations vector. See sys/kern/vnode_if.c.
1986 */
1987 vp->v_vflag &= ~VV_ROOT;
1988 vfs_ref(dead_rootmount);
1989 vfs_insmntque(vp, dead_rootmount);
1990
1991 #ifdef PAX_SEGVGUARD
1992 pax_segvguard_cleanup(vp);
1993 #endif /* PAX_SEGVGUARD */
1994
1995 mutex_enter(vp->v_interlock);
1996 fstrans_done(mp);
1997 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1998 }
1999
2000 /*
2001 * Disassociate the underlying file system from an open device vnode
2002 * and make it anonymous.
2003 *
2004 * Vnode unlocked on entry, drops a reference to the vnode.
2005 */
2006 void
2007 vcache_make_anon(vnode_t *vp)
2008 {
2009 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
2010 uint32_t hash;
2011 bool recycle;
2012
2013 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
2014 KASSERT(vp->v_mount == dead_rootmount ||
2015 fstrans_is_owner(vp->v_mount));
2016 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
2017
2018 /* Remove from vnode cache. */
2019 hash = vcache_hash(&vip->vi_key);
2020 mutex_enter(&vcache_lock);
2021 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
2022 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
2023 vip, vnode_impl, vi_hash);
2024 vip->vi_key.vk_mount = dead_rootmount;
2025 vip->vi_key.vk_key_len = 0;
2026 vip->vi_key.vk_key = NULL;
2027 mutex_exit(&vcache_lock);
2028
2029 /*
2030 * Disassociate the underlying file system from the vnode.
2031 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
2032 * the vnode, and may destroy the vnode so that VOP_UNLOCK
2033 * would no longer function.
2034 */
2035 if (vn_lock(vp, LK_EXCLUSIVE)) {
2036 vnpanic(vp, "%s: cannot lock", __func__);
2037 }
2038 VOP_INACTIVE(vp, &recycle);
2039 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2040 if (VOP_RECLAIM(vp)) {
2041 vnpanic(vp, "%s: cannot reclaim", __func__);
2042 }
2043
2044 /* Purge name cache. */
2045 cache_purge(vp);
2046
2047 /* Done with purge, change operations vector. */
2048 mutex_enter(vp->v_interlock);
2049 vp->v_op = spec_vnodeop_p;
2050 vp->v_vflag |= VV_MPSAFE;
2051 mutex_exit(vp->v_interlock);
2052
2053 /*
2054 * Move to dead mount. Must be after changing the operations
2055 * vector as vnode operations enter the mount before using the
2056 * operations vector. See sys/kern/vnode_if.c.
2057 */
2058 vfs_ref(dead_rootmount);
2059 vfs_insmntque(vp, dead_rootmount);
2060
2061 vrele(vp);
2062 }
2063
2064 /*
2065 * Update outstanding I/O count and do wakeup if requested.
2066 */
2067 void
2068 vwakeup(struct buf *bp)
2069 {
2070 vnode_t *vp;
2071
2072 if ((vp = bp->b_vp) == NULL)
2073 return;
2074
2075 KASSERT(bp->b_objlock == vp->v_interlock);
2076 KASSERT(mutex_owned(bp->b_objlock));
2077
2078 if (--vp->v_numoutput < 0)
2079 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
2080 if (vp->v_numoutput == 0)
2081 cv_broadcast(&vp->v_cv);
2082 }
2083
2084 /*
2085 * Test a vnode for being or becoming dead. Returns one of:
2086 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
2087 * ENOENT: vnode is dead.
2088 * 0: otherwise.
2089 *
2090 * Whenever this function returns a non-zero value all future
2091 * calls will also return a non-zero value.
2092 */
2093 int
2094 vdead_check(struct vnode *vp, int flags)
2095 {
2096
2097 KASSERT(mutex_owned(vp->v_interlock));
2098
2099 if (! ISSET(flags, VDEAD_NOWAIT))
2100 VSTATE_WAIT_STABLE(vp);
2101
2102 if (VSTATE_GET(vp) == VS_RECLAIMING) {
2103 KASSERT(ISSET(flags, VDEAD_NOWAIT));
2104 return SET_ERROR(EBUSY);
2105 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
2106 return SET_ERROR(ENOENT);
2107 }
2108
2109 return 0;
2110 }
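
/*
 * Illustrative only: a hedged sketch of checking for revocation before
 * acting on a possibly dead device vnode (ENXIO is a typical choice for
 * device code):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error)
 *		return SET_ERROR(ENXIO);
 *	...proceed with the operation...
 */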
2111
2112 int
2113 vfs_drainvnodes(void)
2114 {
2115
2116 mutex_enter(&vdrain_lock);
2117
2118 if (!vdrain_one(desiredvnodes)) {
2119 mutex_exit(&vdrain_lock);
2120 return SET_ERROR(EBUSY);
2121 }
2122
2123 mutex_exit(&vdrain_lock);
2124
2125 if (vcache_hashsize != desiredvnodes)
2126 vcache_reinit();
2127
2128 return 0;
2129 }
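
/*
 * Illustrative only: a hedged sketch of shrinking the cache after
 * lowering the vnode target, which is the kind of sequence a
 * kern.maxvnodes sysctl handler performs:
 *
 *	desiredvnodes = new_value;
 *	error = vfs_drainvnodes();	// EBUSY if the target was not met
 */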
2130
2131 void
2132 vnpanic(vnode_t *vp, const char *fmt, ...)
2133 {
2134 va_list ap;
2135
2136 #ifdef DIAGNOSTIC
2137 vprint(NULL, vp);
2138 #endif
2139 va_start(ap, fmt);
2140 vpanic(fmt, ap);
2141 va_end(ap);
2142 }
2143
2144 void
2145 vshareilock(vnode_t *tvp, vnode_t *fvp)
2146 {
2147 kmutex_t *oldlock;
2148
2149 oldlock = tvp->v_interlock;
2150 mutex_obj_hold(fvp->v_interlock);
2151 tvp->v_interlock = fvp->v_interlock;
2152 mutex_obj_free(oldlock);
2153 }
2154
2155 void
2156 vshareklist(vnode_t *tvp, vnode_t *fvp)
2157 {
2158 /*
2159 * If two vnodes share klist state, they must also share
2160 * an interlock.
2161 */
2162 KASSERT(tvp->v_interlock == fvp->v_interlock);
2163
2164 /*
2165 * We make the following assumptions:
2166 *
2167 * ==> Some other synchronization is happening outside of
2168 * our view to make this safe.
2169 *
2170 * ==> That the "to" vnode will have the necessary references
2171 * on the "from" vnode so that the storage for the klist
2172 * won't be yanked out from beneath us (the vnode_impl).
2173 *
2174 * ==> If "from" is also sharing, we then assume that "from"
2175 * has the necessary references, and so on.
2176 */
2177 tvp->v_klist = fvp->v_klist;
2178 }
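
/*
 * Illustrative only: a hedged sketch of aliased vnodes being made to
 * share interlock and knote state, in that order (the specfs device
 * alias code is the expected user):
 *
 *	vshareilock(vp, vq);		// vp now uses vq's v_interlock
 *	vshareklist(vp, vq);		// and vq's v_klist
 */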
2179