1 /* $NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of inactive vnode, via vcache_vget(9).
80 *
81 * Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82 * was another, traditional way. Currently, only the draining thread
83 * recycles vnodes. This behaviour might be revisited.
84 *
85 * The life-cycle ends when the last reference is dropped, usually
86 * in VOP_REMOVE(9). In such a case, VOP_INACTIVE(9) is called to inform
87 * the file system that the vnode is inactive. Via this call, the file
88 * system indicates whether the vnode can be recycled (usually, it checks
89 * its own references, e.g. the link count, whether the file was removed).
90 *
91 * Depending on this indication, the vnode can be put onto a free list
92 * (cache), or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
93 * disassociate the underlying file system from the vnode, and finally
94 * destroyed.
95 *
96 * Vnode state
97 *
98 * Vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating with the underlying file system
102 * and is not yet ready to use.
103 * - LOADED Vnode has an associated underlying file system and is
104 * ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from underlying file system
109 * and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * BLOCKED -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system or vcache_rekey*()
129 * drops a vnode used as placeholder.
130 *
131 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate
132 * and it is possible to wait for state change.
133 *
134 * State is protected with v_interlock with one exception:
135 * to change from LOADING both v_interlock and vcache_lock must be
136 * held, so it is possible to check "state == LOADING" while holding
137 * only vcache_lock, without v_interlock. See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 * A vnode is considered active if its reference count
142 * (vnode_t::v_usecount) is non-zero. The count is maintained using the
143 * vref(9), vrele(9) and vput(9) routines. Common holders of references
144 * are e.g. open files, the current working directory, mount points, etc.
145 *
146 * v_usecount is adjusted with atomic operations, however to change
147 * from a non-zero value to zero the interlock must also be held.
148 */
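/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): how a file system typically drives this life-cycle.
 * vcache_get(9) returns a referenced but unlocked vnode; the caller
 * locks it with vn_lock(9), operates on it, then drops the lock and
 * the reference with vput(9) (or VOP_UNLOCK(9) followed by vrele(9)).
 * The key layout below, a plain inode number, is an assumption made
 * for this example only.
 *
 *	struct vnode *vp;
 *	ino_t ino = ...;			(file system specific key)
 *	int error;
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error)
 *		return error;
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	...					(use the locked vnode)
 *	vput(vp);				(unlock and drop the reference)
 */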
149
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $");
152
153 #ifdef _KERNEL_OPT
154 #include "opt_pax.h"
155 #endif
156
157 #include <sys/param.h>
158 #include <sys/kernel.h>
159
160 #include <sys/atomic.h>
161 #include <sys/buf.h>
162 #include <sys/conf.h>
163 #include <sys/device.h>
164 #include <sys/hash.h>
165 #include <sys/kauth.h>
166 #include <sys/kmem.h>
167 #include <sys/module.h>
168 #include <sys/mount.h>
169 #include <sys/namei.h>
170 #include <sys/pax.h>
171 #include <sys/syscallargs.h>
172 #include <sys/sysctl.h>
173 #include <sys/systm.h>
174 #include <sys/threadpool.h>
175 #include <sys/vnode_impl.h>
176 #include <sys/wapbl.h>
177 #include <sys/fstrans.h>
178
179 #include <miscfs/deadfs/deadfs.h>
180 #include <miscfs/specfs/specdev.h>
181
182 #include <uvm/uvm.h>
183 #include <uvm/uvm_readahead.h>
184 #include <uvm/uvm_stat.h>
185
186 /* Flags to vrelel. */
187 #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
188
189 #define LRU_VRELE 0
190 #define LRU_FREE 1
191 #define LRU_HOLD 2
192 #define LRU_COUNT 3
193
194 /*
195 * There are three lru lists: one holds vnodes waiting for async release,
196 * one is for vnodes which have no buffer/page references and one for those
197 * which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
198 * private cache line as vnodes migrate between them while under the same
199 * lock (vdrain_lock).
200 */
201
202 typedef struct {
203 vnode_impl_t *li_marker;
204 } lru_iter_t;
205
206 u_int numvnodes __cacheline_aligned;
207 static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
208 static struct threadpool *threadpool;
209 static struct threadpool_job vdrain_job;
210 static struct threadpool_job vrele_job;
211 static kmutex_t vdrain_lock __cacheline_aligned;
212 SLIST_HEAD(hashhead, vnode_impl);
213 static kmutex_t vcache_lock __cacheline_aligned;
214 static kcondvar_t vcache_cv;
215 static u_int vcache_hashsize;
216 static u_long vcache_hashmask;
217 static struct hashhead *vcache_hashtab;
218 static pool_cache_t vcache_pool;
219 static void lru_requeue(vnode_t *, vnodelst_t *);
220 static vnodelst_t * lru_which(vnode_t *);
221 static vnode_impl_t * lru_iter_first(int, lru_iter_t *);
222 static vnode_impl_t * lru_iter_next(lru_iter_t *);
223 static void lru_iter_release(lru_iter_t *);
224 static vnode_impl_t * vcache_alloc(void);
225 static void vcache_dealloc(vnode_impl_t *);
226 static void vcache_free(vnode_impl_t *);
227 static void vcache_init(void);
228 static void vcache_reinit(void);
229 static void vcache_reclaim(vnode_t *);
230 static void vrele_deferred(vnode_impl_t *);
231 static void vrelel(vnode_t *, int, int);
232 static void vnpanic(vnode_t *, const char *, ...)
233 __printflike(2, 3);
234 static bool vdrain_one(u_int);
235 static void vdrain_task(struct threadpool_job *);
236 static void vrele_task(struct threadpool_job *);
237
238 /* Routines having to do with the management of the vnode table. */
239
240 /*
241 * The high bit of v_usecount is a gate for vcache_tryvget(). It's set
242 * only when the vnode state is LOADED.
243 * The next bit of v_usecount is a flag for vrelel(). It's set
244 * from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
245 */
246 #define VUSECOUNT_MASK 0x3fffffff
247 #define VUSECOUNT_GATE 0x80000000
248 #define VUSECOUNT_VGET 0x40000000
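/*
 * Example (editorial illustration): with the masks above, a v_usecount
 * value of 0x80000002 decodes as gate open (VUSECOUNT_GATE set),
 * VUSECOUNT_VGET clear, and a plain reference count of 2, so
 * vrefcnt() below returns 2 for such a vnode.
 */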
249
250 /*
251 * Return the current usecount of a vnode.
252 */
253 inline int
254 vrefcnt(struct vnode *vp)
255 {
256
257 return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
258 }
259
260 /* Vnode state operations and diagnostics. */
261
262 #if defined(DIAGNOSTIC)
263
264 #define VSTATE_VALID(state) \
265 ((state) != VS_ACTIVE && (state) != VS_MARKER)
266 #define VSTATE_GET(vp) \
267 vstate_assert_get((vp), __func__, __LINE__)
268 #define VSTATE_CHANGE(vp, from, to) \
269 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
270 #define VSTATE_WAIT_STABLE(vp) \
271 vstate_assert_wait_stable((vp), __func__, __LINE__)
272
273 void
274 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
275 bool has_lock)
276 {
277 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
278 int refcnt = vrefcnt(vp);
279
280 if (!has_lock) {
281 enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
282
283 if (state == VS_ACTIVE && refcnt > 0 &&
284 (vstate == VS_LOADED || vstate == VS_BLOCKED))
285 return;
286 if (vstate == state)
287 return;
288 mutex_enter((vp)->v_interlock);
289 }
290
291 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
292
293 if ((state == VS_ACTIVE && refcnt > 0 &&
294 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
295 vip->vi_state == state) {
296 if (!has_lock)
297 mutex_exit((vp)->v_interlock);
298 return;
299 }
300 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
301 vstate_name(vip->vi_state), refcnt,
302 vstate_name(state), func, line);
303 }
304
305 static enum vnode_state
306 vstate_assert_get(vnode_t *vp, const char *func, int line)
307 {
308 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
309
310 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
311 if (! VSTATE_VALID(vip->vi_state))
312 vnpanic(vp, "state is %s at %s:%d",
313 vstate_name(vip->vi_state), func, line);
314
315 return vip->vi_state;
316 }
317
318 static void
319 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
320 {
321 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
322
323 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
324 if (! VSTATE_VALID(vip->vi_state))
325 vnpanic(vp, "state is %s at %s:%d",
326 vstate_name(vip->vi_state), func, line);
327
328 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
329 cv_wait(&vp->v_cv, vp->v_interlock);
330
331 if (! VSTATE_VALID(vip->vi_state))
332 vnpanic(vp, "state is %s at %s:%d",
333 vstate_name(vip->vi_state), func, line);
334 }
335
336 static void
337 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
338 const char *func, int line)
339 {
340 bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
341 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
342
343 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
344 if (from == VS_LOADING)
345 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
346
347 if (! VSTATE_VALID(from))
348 vnpanic(vp, "from is %s at %s:%d",
349 vstate_name(from), func, line);
350 if (! VSTATE_VALID(to))
351 vnpanic(vp, "to is %s at %s:%d",
352 vstate_name(to), func, line);
353 if (vip->vi_state != from)
354 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
355 vstate_name(vip->vi_state), vstate_name(from), func, line);
356 if ((from == VS_LOADED) != gated)
357 vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
358 vstate_name(vip->vi_state), gated, func, line);
359
360 /* Open/close the gate for vcache_tryvget(). */
361 if (to == VS_LOADED) {
362 membar_release();
363 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
364 } else {
365 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
366 }
367
368 atomic_store_relaxed(&vip->vi_state, to);
369 if (from == VS_LOADING)
370 cv_broadcast(&vcache_cv);
371 if (to == VS_LOADED || to == VS_RECLAIMED)
372 cv_broadcast(&vp->v_cv);
373 }
374
375 #else /* defined(DIAGNOSTIC) */
376
377 #define VSTATE_GET(vp) \
378 (VNODE_TO_VIMPL((vp))->vi_state)
379 #define VSTATE_CHANGE(vp, from, to) \
380 vstate_change((vp), (from), (to))
381 #define VSTATE_WAIT_STABLE(vp) \
382 vstate_wait_stable((vp))
383 void
384 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
385 bool has_lock)
386 {
387
388 }
389
390 static void
391 vstate_wait_stable(vnode_t *vp)
392 {
393 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
394
395 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
396 cv_wait(&vp->v_cv, vp->v_interlock);
397 }
398
399 static void
400 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
401 {
402 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
403
404 /* Open/close the gate for vcache_tryvget(). */
405 if (to == VS_LOADED) {
406 membar_release();
407 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
408 } else {
409 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
410 }
411
412 atomic_store_relaxed(&vip->vi_state, to);
413 if (from == VS_LOADING)
414 cv_broadcast(&vcache_cv);
415 if (to == VS_LOADED || to == VS_RECLAIMED)
416 cv_broadcast(&vp->v_cv);
417 }
418
419 #endif /* defined(DIAGNOSTIC) */
420
421 void
422 vfs_vnode_sysinit(void)
423 {
424 int error __diagused, i;
425
426 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
427 KASSERT(dead_rootmount != NULL);
428 dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
429
430 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
431 for (i = 0; i < LRU_COUNT; i++) {
432 TAILQ_INIT(&lru_list[i]);
433 }
434 vcache_init();
435
436 error = threadpool_get(&threadpool, PRI_NONE);
437 KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
438 threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
439 threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
440 }
441
442 /*
443 * Allocate a new marker vnode.
444 */
445 vnode_t *
446 vnalloc_marker(struct mount *mp)
447 {
448 vnode_impl_t *vip;
449 vnode_t *vp;
450
451 vip = pool_cache_get(vcache_pool, PR_WAITOK);
452 memset(vip, 0, sizeof(*vip));
453 vp = VIMPL_TO_VNODE(vip);
454 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
455 vp->v_mount = mp;
456 vp->v_type = VBAD;
457 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
458 klist_init(&vip->vi_klist.vk_klist);
459 vp->v_klist = &vip->vi_klist;
460 vip->vi_state = VS_MARKER;
461
462 return vp;
463 }
464
465 /*
466 * Free a marker vnode.
467 */
468 void
469 vnfree_marker(vnode_t *vp)
470 {
471 vnode_impl_t *vip;
472
473 vip = VNODE_TO_VIMPL(vp);
474 KASSERT(vip->vi_state == VS_MARKER);
475 mutex_obj_free(vp->v_interlock);
476 uvm_obj_destroy(&vp->v_uobj, true);
477 klist_fini(&vip->vi_klist.vk_klist);
478 pool_cache_put(vcache_pool, vip);
479 }
480
481 /*
482 * Test a vnode for being a marker vnode.
483 */
484 bool
485 vnis_marker(vnode_t *vp)
486 {
487
488 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
489 }
490
491 /*
492 * Return the lru list this node should be on.
493 */
494 static vnodelst_t *
495 lru_which(vnode_t *vp)
496 {
497
498 KASSERT(mutex_owned(vp->v_interlock));
499
500 if (vp->v_holdcnt > 0)
501 return &lru_list[LRU_HOLD];
502 else
503 return &lru_list[LRU_FREE];
504 }
505
506 /*
507 * Put vnode to end of given list.
508 * Both the current and the new list may be NULL, used on vnode alloc/free.
509 * Adjust numvnodes and signal vdrain thread if there is work.
510 */
511 static void
512 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
513 {
514 vnode_impl_t *vip;
515 int d;
516
517 /*
518 * If the vnode is on the correct list, and was put there recently,
519 * then leave it be, thus avoiding huge cache and lock contention.
520 */
521 vip = VNODE_TO_VIMPL(vp);
522 if (listhd == vip->vi_lrulisthd &&
523 (getticks() - vip->vi_lrulisttm) < hz) {
524 return;
525 }
526
527 mutex_enter(&vdrain_lock);
528 d = 0;
529 if (vip->vi_lrulisthd != NULL)
530 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
531 else
532 d++;
533 vip->vi_lrulisthd = listhd;
534 vip->vi_lrulisttm = getticks();
535 if (vip->vi_lrulisthd != NULL)
536 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
537 else
538 d--;
539 if (d != 0) {
540 /*
541 * Looks strange? This is not a bug. Don't store
542 * numvnodes unless there is a change - avoid false
543 * sharing on MP.
544 */
545 numvnodes += d;
546 }
547 if (listhd == &lru_list[LRU_VRELE])
548 threadpool_schedule_job(threadpool, &vrele_job);
549 if (d > 0 && numvnodes > desiredvnodes)
550 threadpool_schedule_job(threadpool, &vdrain_job);
551 if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
552 kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
553 mutex_exit(&vdrain_lock);
554 }
555
556 /*
557 * LRU list iterator.
558 * Caller holds vdrain_lock.
559 */
560 static vnode_impl_t *
561 lru_iter_first(int idx, lru_iter_t *iterp)
562 {
563 vnode_impl_t *marker;
564
565 KASSERT(mutex_owned(&vdrain_lock));
566
567 mutex_exit(&vdrain_lock);
568 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
569 mutex_enter(&vdrain_lock);
570 marker->vi_lrulisthd = &lru_list[idx];
571 iterp->li_marker = marker;
572
573 TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
574
575 return lru_iter_next(iterp);
576 }
577
578 static vnode_impl_t *
579 lru_iter_next(lru_iter_t *iter)
580 {
581 vnode_impl_t *vip, *marker;
582 vnodelst_t *listhd;
583
584 KASSERT(mutex_owned(&vdrain_lock));
585
586 marker = iter->li_marker;
587 listhd = marker->vi_lrulisthd;
588
589 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
590 TAILQ_REMOVE(listhd, marker, vi_lrulist);
591 TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
592 if (!vnis_marker(VIMPL_TO_VNODE(vip)))
593 break;
594 }
595
596 return vip;
597 }
598
599 static void
600 lru_iter_release(lru_iter_t *iter)
601 {
602 vnode_impl_t *marker;
603
604 KASSERT(mutex_owned(&vdrain_lock));
605
606 marker = iter->li_marker;
607 TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
608
609 mutex_exit(&vdrain_lock);
610 vnfree_marker(VIMPL_TO_VNODE(marker));
611 mutex_enter(&vdrain_lock);
612 }
613
614 /*
615 * Release deferred vrele vnodes for this mount.
616 * Called with file system suspended.
617 */
618 void
619 vrele_flush(struct mount *mp)
620 {
621 lru_iter_t iter;
622 vnode_impl_t *vip;
623
624 KASSERT(fstrans_is_owner(mp));
625
626 mutex_enter(&vdrain_lock);
627 for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
628 vip = lru_iter_next(&iter)) {
629 if (VIMPL_TO_VNODE(vip)->v_mount != mp)
630 continue;
631 vrele_deferred(vip);
632 }
633 lru_iter_release(&iter);
634 mutex_exit(&vdrain_lock);
635 }
636
637 /*
638 * One pass through the LRU lists to keep the number of allocated
639 * vnodes below target. Returns true if target met.
640 */
641 static bool
642 vdrain_one(u_int target)
643 {
644 int ix, lists[] = { LRU_FREE, LRU_HOLD };
645 lru_iter_t iter;
646 vnode_impl_t *vip;
647 vnode_t *vp;
648 struct mount *mp;
649
650 KASSERT(mutex_owned(&vdrain_lock));
651
652 for (ix = 0; ix < __arraycount(lists); ix++) {
653 for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
654 vip = lru_iter_next(&iter)) {
655 if (numvnodes < target) {
656 lru_iter_release(&iter);
657 return true;
658 }
659
660 vp = VIMPL_TO_VNODE(vip);
661
662 /* Probe usecount (unlocked). */
663 if (vrefcnt(vp) > 0)
664 continue;
665 /* Try v_interlock -- we lock the wrong direction! */
666 if (!mutex_tryenter(vp->v_interlock))
667 continue;
668 /* Probe usecount and state. */
669 if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
670 mutex_exit(vp->v_interlock);
671 continue;
672 }
673 mutex_exit(&vdrain_lock);
674
675 mp = vp->v_mount;
676 if (fstrans_start_nowait(mp) != 0) {
677 mutex_exit(vp->v_interlock);
678 mutex_enter(&vdrain_lock);
679 continue;
680 }
681
682 if (vcache_vget(vp) == 0) {
683 if (!vrecycle(vp)) {
684 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
685 mutex_enter(vp->v_interlock);
686 vrelel(vp, 0, LK_EXCLUSIVE);
687 }
688 }
689 fstrans_done(mp);
690
691 mutex_enter(&vdrain_lock);
692 }
693 lru_iter_release(&iter);
694 }
695
696 return false;
697 }
698
699 /*
700 * threadpool task to keep the number of vnodes below desiredvnodes.
701 */
702 static void
703 vdrain_task(struct threadpool_job *job)
704 {
705 u_int target;
706
707 target = desiredvnodes - desiredvnodes / 16;
708
709 mutex_enter(&vdrain_lock);
710
711 while (!vdrain_one(target))
712 kpause("vdrain", false, 1, &vdrain_lock);
713
714 threadpool_job_done(job);
715 mutex_exit(&vdrain_lock);
716 }
717
718 /*
719 * threadpool task to process asynchronous vrele.
720 */
721 static void
722 vrele_task(struct threadpool_job *job)
723 {
724 int skipped;
725 lru_iter_t iter;
726 vnode_impl_t *vip;
727 struct mount *mp;
728
729 mutex_enter(&vdrain_lock);
730 while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
731 for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
732 mp = VIMPL_TO_VNODE(vip)->v_mount;
733 if (fstrans_start_nowait(mp) == 0) {
734 vrele_deferred(vip);
735 fstrans_done(mp);
736 } else {
737 skipped++;
738 }
739 }
740
741 lru_iter_release(&iter);
742 if (skipped)
743 kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
744 }
745
746 threadpool_job_done(job);
747 lru_iter_release(&iter);
748 mutex_exit(&vdrain_lock);
749 }
750
751 /*
752 * Try to drop reference on a vnode. Abort if we are releasing the
753 * last reference. Note: this _must_ succeed if not the last reference.
754 */
755 static bool
756 vtryrele(vnode_t *vp)
757 {
758 u_int use, next;
759
760 membar_release();
761 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
762 if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
763 return false;
764 }
765 KASSERT((use & VUSECOUNT_MASK) > 1);
766 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
767 if (__predict_true(next == use)) {
768 return true;
769 }
770 }
771 }
772
773 /*
774 * vput: unlock and release the reference.
775 */
776 void
777 vput(vnode_t *vp)
778 {
779 int lktype;
780
781 /*
782 * Do an unlocked check of the usecount. If it looks like we're not
783 * about to drop the last reference, then unlock the vnode and try
784 * to drop the reference. If it ends up being the last reference
785 * after all, vrelel() can fix it all up. Most of the time this
786 * will all go to plan.
787 */
788 if (vrefcnt(vp) > 1) {
789 VOP_UNLOCK(vp);
790 if (vtryrele(vp)) {
791 return;
792 }
793 lktype = LK_NONE;
794 } else {
795 lktype = VOP_ISLOCKED(vp);
796 KASSERT(lktype != LK_NONE);
797 }
798 mutex_enter(vp->v_interlock);
799 vrelel(vp, 0, lktype);
800 }
801
802 /*
803 * Release a vnode from the deferred list.
804 */
805 static void
806 vrele_deferred(vnode_impl_t *vip)
807 {
808 vnode_t *vp;
809
810 KASSERT(mutex_owned(&vdrain_lock));
811 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
812
813 vp = VIMPL_TO_VNODE(vip);
814
815 /*
816 * First remove the vnode from the vrele list.
817 * Put it on the last lru list; the last vrele()
818 * will put it back onto the right list before
819 * its usecount reaches zero.
820 */
821 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
822 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
823 vip->vi_lrulisttm = getticks();
824 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
825
826 mutex_exit(&vdrain_lock);
827
828 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
829 mutex_enter(vp->v_interlock);
830 vrelel(vp, 0, LK_EXCLUSIVE);
831
832 mutex_enter(&vdrain_lock);
833 }
834
835 /*
836 * Vnode release. If reference count drops to zero, call inactive
837 * routine and either return to freelist or free to the pool.
838 */
839 static void
840 vrelel(vnode_t *vp, int flags, int lktype)
841 {
842 const bool async = ((flags & VRELEL_ASYNC) != 0);
843 bool recycle, defer, objlock_held;
844 u_int use, next;
845 int error;
846
847 objlock_held = false;
848
849 retry:
850 KASSERT(mutex_owned(vp->v_interlock));
851
852 if (__predict_false(vp->v_op == dead_vnodeop_p &&
853 VSTATE_GET(vp) != VS_RECLAIMED)) {
854 vnpanic(vp, "dead but not clean");
855 }
856
857 /*
858 * If not the last reference, just unlock and drop the reference count.
859 *
860 * Otherwise make sure we pass a point in time where we hold the
861 * last reference with VGET flag unset.
862 */
863 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
864 if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
865 if (objlock_held) {
866 objlock_held = false;
867 rw_exit(vp->v_uobj.vmobjlock);
868 }
869 if (lktype != LK_NONE) {
870 mutex_exit(vp->v_interlock);
871 lktype = LK_NONE;
872 VOP_UNLOCK(vp);
873 mutex_enter(vp->v_interlock);
874 }
875 if (vtryrele(vp)) {
876 mutex_exit(vp->v_interlock);
877 return;
878 }
879 next = atomic_load_relaxed(&vp->v_usecount);
880 continue;
881 }
882 KASSERT((use & VUSECOUNT_MASK) == 1);
883 next = use & ~VUSECOUNT_VGET;
884 if (next != use) {
885 next = atomic_cas_uint(&vp->v_usecount, use, next);
886 }
887 if (__predict_true(next == use)) {
888 break;
889 }
890 }
891 membar_acquire();
892 if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
893 vnpanic(vp, "%s: bad ref count", __func__);
894 }
895
896 #ifdef DIAGNOSTIC
897 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
898 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
899 vprint("vrelel: missing VOP_CLOSE()", vp);
900 }
901 #endif
902
903 /*
904 * If already clean there is no need to lock, defer or
905 * deactivate this node.
906 */
907 if (VSTATE_GET(vp) == VS_RECLAIMED) {
908 if (objlock_held) {
909 objlock_held = false;
910 rw_exit(vp->v_uobj.vmobjlock);
911 }
912 if (lktype != LK_NONE) {
913 mutex_exit(vp->v_interlock);
914 lktype = LK_NONE;
915 VOP_UNLOCK(vp);
916 mutex_enter(vp->v_interlock);
917 }
918 goto out;
919 }
920
921 /*
922 * First try to get the vnode locked for VOP_INACTIVE().
923 * Defer the vnode release to the vrele task if the caller requests
924 * it explicitly, is the pagedaemon, or if the lock attempt failed.
925 */
926 defer = false;
927 if ((curlwp == uvm.pagedaemon_lwp) || async) {
928 defer = true;
929 } else if (lktype == LK_SHARED) {
930 /* Excellent chance of getting the lock if this is the last ref. */
931 error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
932 if (error != 0) {
933 defer = true;
934 } else {
935 lktype = LK_EXCLUSIVE;
936 }
937 } else if (lktype == LK_NONE) {
938 /* Excellent chance of getting the lock if this is the last ref. */
939 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
940 if (error != 0) {
941 defer = true;
942 } else {
943 lktype = LK_EXCLUSIVE;
944 }
945 }
946 KASSERT(mutex_owned(vp->v_interlock));
947 if (defer) {
948 /*
949 * Defer reclaim to the vrele task; it's not safe to
950 * clean it here. We donate it our last reference.
951 */
952 if (lktype != LK_NONE) {
953 mutex_exit(vp->v_interlock);
954 VOP_UNLOCK(vp);
955 mutex_enter(vp->v_interlock);
956 }
957 lru_requeue(vp, &lru_list[LRU_VRELE]);
958 mutex_exit(vp->v_interlock);
959 return;
960 }
961 KASSERT(lktype == LK_EXCLUSIVE);
962
963 /* If the node gained another reference, retry. */
964 use = atomic_load_relaxed(&vp->v_usecount);
965 if ((use & VUSECOUNT_VGET) != 0) {
966 goto retry;
967 }
968 KASSERT((use & VUSECOUNT_MASK) == 1);
969
970 if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
971 (vp->v_vflag & VV_MAPPED) != 0) {
972 /* Take care of space accounting. */
973 if (!objlock_held) {
974 objlock_held = true;
975 if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
976 mutex_exit(vp->v_interlock);
977 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
978 mutex_enter(vp->v_interlock);
979 goto retry;
980 }
981 }
982 if ((vp->v_iflag & VI_EXECMAP) != 0) {
983 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
984 }
985 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
986 vp->v_vflag &= ~VV_MAPPED;
987 }
988 if (objlock_held) {
989 objlock_held = false;
990 rw_exit(vp->v_uobj.vmobjlock);
991 }
992
993 /*
994 * Deactivate the vnode, but preserve our reference across
995 * the call to VOP_INACTIVE().
996 *
997 * If VOP_INACTIVE() indicates that the file has been
998 * deleted, then recycle the vnode.
999 *
1000 * Note that VOP_INACTIVE() will not drop the vnode lock.
1001 */
1002 mutex_exit(vp->v_interlock);
1003 recycle = false;
1004 VOP_INACTIVE(vp, &recycle);
1005 if (!recycle) {
1006 lktype = LK_NONE;
1007 VOP_UNLOCK(vp);
1008 }
1009 mutex_enter(vp->v_interlock);
1010
1011 /*
1012 * Block new references then check again to see if a
1013 * new reference was acquired in the meantime. If
1014 * it was, restore the vnode state and try again.
1015 */
1016 if (recycle) {
1017 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1018 use = atomic_load_relaxed(&vp->v_usecount);
1019 if ((use & VUSECOUNT_VGET) != 0) {
1020 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1021 goto retry;
1022 }
1023 KASSERT((use & VUSECOUNT_MASK) == 1);
1024 }
1025
1026 /*
1027 * Recycle the vnode if the file is now unused (unlinked).
1028 */
1029 if (recycle) {
1030 VSTATE_ASSERT(vp, VS_BLOCKED);
1031 KASSERT(lktype == LK_EXCLUSIVE);
1032 /* vcache_reclaim drops the lock. */
1033 lktype = LK_NONE;
1034 vcache_reclaim(vp);
1035 }
1036 KASSERT(vrefcnt(vp) > 0);
1037 KASSERT(lktype == LK_NONE);
1038
1039 out:
1040 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1041 if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
1042 (use & VUSECOUNT_MASK) == 1)) {
1043 /* Gained and released another reference, retry. */
1044 goto retry;
1045 }
1046 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1047 if (__predict_true(next == use)) {
1048 if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
1049 /* Gained another reference. */
1050 mutex_exit(vp->v_interlock);
1051 return;
1052 }
1053 break;
1054 }
1055 }
1056 membar_acquire();
1057
1058 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
1059 /*
1060 * It's clean so destroy it. It isn't referenced
1061 * anywhere since it has been reclaimed.
1062 */
1063 vcache_free(VNODE_TO_VIMPL(vp));
1064 } else {
1065 /*
1066 * Otherwise, put it back onto the freelist. It
1067 * can't be destroyed while still associated with
1068 * a file system.
1069 */
1070 lru_requeue(vp, lru_which(vp));
1071 mutex_exit(vp->v_interlock);
1072 }
1073 }
1074
1075 void
1076 vrele(vnode_t *vp)
1077 {
1078
1079 if (vtryrele(vp)) {
1080 return;
1081 }
1082 mutex_enter(vp->v_interlock);
1083 vrelel(vp, 0, LK_NONE);
1084 }
1085
1086 /*
1087 * Asynchronous vnode release: the vnode is released in a different context.
1088 */
1089 void
1090 vrele_async(vnode_t *vp)
1091 {
1092
1093 if (vtryrele(vp)) {
1094 return;
1095 }
1096 mutex_enter(vp->v_interlock);
1097 vrelel(vp, VRELEL_ASYNC, LK_NONE);
1098 }
1099
1100 /*
1101 * Vnode reference, where a reference is already held by some other
1102 * object (for example, a file structure).
1103 *
1104 * NB: lockless code sequences may rely on this not blocking.
1105 */
1106 void
1107 vref(vnode_t *vp)
1108 {
1109
1110 KASSERT(vrefcnt(vp) > 0);
1111
1112 atomic_inc_uint(&vp->v_usecount);
1113 }
1114
1115 /*
1116 * Page or buffer structure gets a reference.
1117 * Called with v_interlock held.
1118 */
1119 void
1120 vholdl(vnode_t *vp)
1121 {
1122
1123 KASSERT(mutex_owned(vp->v_interlock));
1124
1125 if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
1126 lru_requeue(vp, lru_which(vp));
1127 }
1128
1129 /*
1130 * Page or buffer structure gets a reference.
1131 */
1132 void
1133 vhold(vnode_t *vp)
1134 {
1135
1136 mutex_enter(vp->v_interlock);
1137 vholdl(vp);
1138 mutex_exit(vp->v_interlock);
1139 }
1140
1141 /*
1142 * Page or buffer structure frees a reference.
1143 * Called with v_interlock held.
1144 */
1145 void
1146 holdrelel(vnode_t *vp)
1147 {
1148
1149 KASSERT(mutex_owned(vp->v_interlock));
1150
1151 if (vp->v_holdcnt <= 0) {
1152 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
1153 }
1154
1155 vp->v_holdcnt--;
1156 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1157 lru_requeue(vp, lru_which(vp));
1158 }
1159
1160 /*
1161 * Page or buffer structure frees a reference.
1162 */
1163 void
1164 holdrele(vnode_t *vp)
1165 {
1166
1167 mutex_enter(vp->v_interlock);
1168 holdrelel(vp);
1169 mutex_exit(vp->v_interlock);
1170 }
1171
1172 /*
1173 * Recycle an unused vnode if caller holds the last reference.
1174 */
1175 bool
1176 vrecycle(vnode_t *vp)
1177 {
1178 int error __diagused;
1179
1180 mutex_enter(vp->v_interlock);
1181
1182 /* If the vnode is already clean we're done. */
1183 VSTATE_WAIT_STABLE(vp);
1184 if (VSTATE_GET(vp) != VS_LOADED) {
1185 VSTATE_ASSERT(vp, VS_RECLAIMED);
1186 vrelel(vp, 0, LK_NONE);
1187 return true;
1188 }
1189
1190 /* Prevent further references until the vnode is locked. */
1191 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1192
1193 /* Make sure we hold the last reference. */
1194 if (vrefcnt(vp) != 1) {
1195 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1196 mutex_exit(vp->v_interlock);
1197 return false;
1198 }
1199
1200 mutex_exit(vp->v_interlock);
1201
1202 /*
1203 * On a leaf file system this lock will always succeed as we hold
1204 * the last reference and prevent further references.
1205 * On layered file systems waiting for the lock would open a can of
1206 * deadlocks as the lower vnodes may have other active references.
1207 */
1208 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
1209
1210 mutex_enter(vp->v_interlock);
1211 if (error) {
1212 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
1213 mutex_exit(vp->v_interlock);
1214 return false;
1215 }
1216
1217 KASSERT(vrefcnt(vp) == 1);
1218 vcache_reclaim(vp);
1219 vrelel(vp, 0, LK_NONE);
1220
1221 return true;
1222 }
1223
1224 /*
1225 * Helper for vrevoke() to propagate suspension from lastmp
1226 * to thismp. Both args may be NULL.
1227 * Returns the currently suspended file system or NULL.
1228 */
1229 static struct mount *
1230 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
1231 {
1232 int error;
1233
1234 if (lastmp == thismp)
1235 return thismp;
1236
1237 if (lastmp != NULL)
1238 vfs_resume(lastmp);
1239
1240 if (thismp == NULL)
1241 return NULL;
1242
1243 do {
1244 error = vfs_suspend(thismp, 0);
1245 } while (error == EINTR || error == ERESTART);
1246
1247 if (error == 0)
1248 return thismp;
1249
1250 KASSERT(error == EOPNOTSUPP || error == ENOENT);
1251 return NULL;
1252 }
1253
1254 /*
1255 * Eliminate all activity associated with the requested vnode
1256 * and with all vnodes aliased to the requested vnode.
1257 */
1258 void
1259 vrevoke(vnode_t *vp)
1260 {
1261 struct mount *mp;
1262 vnode_t *vq;
1263 enum vtype type;
1264 dev_t dev;
1265
1266 KASSERT(vrefcnt(vp) > 0);
1267
1268 mp = vrevoke_suspend_next(NULL, vp->v_mount);
1269
1270 mutex_enter(vp->v_interlock);
1271 VSTATE_WAIT_STABLE(vp);
1272 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1273 mutex_exit(vp->v_interlock);
1274 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1275 atomic_inc_uint(&vp->v_usecount);
1276 mutex_exit(vp->v_interlock);
1277 vgone(vp);
1278 } else {
1279 dev = vp->v_rdev;
1280 type = vp->v_type;
1281 mutex_exit(vp->v_interlock);
1282
1283 while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
1284 == 0) {
1285 mp = vrevoke_suspend_next(mp, vq->v_mount);
1286 vgone(vq);
1287 }
1288 }
1289 vrevoke_suspend_next(mp, NULL);
1290 }
1291
1292 /*
1293 * Eliminate all activity associated with a vnode in preparation for
1294 * reuse. Drops a reference from the vnode.
1295 */
1296 void
1297 vgone(vnode_t *vp)
1298 {
1299 int lktype;
1300
1301 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1302
1303 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1304 lktype = LK_EXCLUSIVE;
1305 mutex_enter(vp->v_interlock);
1306 VSTATE_WAIT_STABLE(vp);
1307 if (VSTATE_GET(vp) == VS_LOADED) {
1308 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
1309 vcache_reclaim(vp);
1310 lktype = LK_NONE;
1311 }
1312 VSTATE_ASSERT(vp, VS_RECLAIMED);
1313 vrelel(vp, 0, lktype);
1314 }
1315
1316 static inline uint32_t
1317 vcache_hash(const struct vcache_key *key)
1318 {
1319 uint32_t hash = HASH32_BUF_INIT;
1320
1321 KASSERT(key->vk_key_len > 0);
1322
1323 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1324 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1325 return hash;
1326 }
1327
1328 static int
1329 vcache_stats(struct hashstat_sysctl *hs, bool fill)
1330 {
1331 vnode_impl_t *vip;
1332 uint64_t chain;
1333
1334 strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
1335 strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
1336 if (!fill)
1337 return 0;
1338
1339 hs->hash_size = vcache_hashmask + 1;
1340
1341 for (size_t i = 0; i < hs->hash_size; i++) {
1342 chain = 0;
1343 mutex_enter(&vcache_lock);
1344 SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
1345 chain++;
1346 }
1347 mutex_exit(&vcache_lock);
1348 if (chain > 0) {
1349 hs->hash_used++;
1350 hs->hash_items += chain;
1351 if (chain > hs->hash_maxchain)
1352 hs->hash_maxchain = chain;
1353 }
1354 preempt_point();
1355 }
1356
1357 return 0;
1358 }
1359
1360 static void
1361 vcache_init(void)
1362 {
1363
1364 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
1365 0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1366 KASSERT(vcache_pool != NULL);
1367 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1368 cv_init(&vcache_cv, "vcache");
1369 vcache_hashsize = desiredvnodes;
1370 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1371 &vcache_hashmask);
1372 hashstat_register("vcache", vcache_stats);
1373 }
1374
1375 static void
1376 vcache_reinit(void)
1377 {
1378 int i;
1379 uint32_t hash;
1380 u_long oldmask, newmask;
1381 struct hashhead *oldtab, *newtab;
1382 vnode_impl_t *vip;
1383
1384 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1385 mutex_enter(&vcache_lock);
1386 oldtab = vcache_hashtab;
1387 oldmask = vcache_hashmask;
1388 vcache_hashsize = desiredvnodes;
1389 vcache_hashtab = newtab;
1390 vcache_hashmask = newmask;
1391 for (i = 0; i <= oldmask; i++) {
1392 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1393 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1394 hash = vcache_hash(&vip->vi_key);
1395 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1396 vip, vi_hash);
1397 }
1398 }
1399 mutex_exit(&vcache_lock);
1400 hashdone(oldtab, HASH_SLIST, oldmask);
1401 }
1402
1403 static inline vnode_impl_t *
1404 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1405 {
1406 struct hashhead *hashp;
1407 vnode_impl_t *vip;
1408
1409 KASSERT(mutex_owned(&vcache_lock));
1410
1411 hashp = &vcache_hashtab[hash & vcache_hashmask];
1412 SLIST_FOREACH(vip, hashp, vi_hash) {
1413 if (key->vk_mount != vip->vi_key.vk_mount)
1414 continue;
1415 if (key->vk_key_len != vip->vi_key.vk_key_len)
1416 continue;
1417 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1418 continue;
1419 return vip;
1420 }
1421 return NULL;
1422 }
1423
1424 /*
1425 * Allocate a new, uninitialized vcache node.
1426 */
1427 static vnode_impl_t *
1428 vcache_alloc(void)
1429 {
1430 vnode_impl_t *vip;
1431 vnode_t *vp;
1432
1433 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1434 vp = VIMPL_TO_VNODE(vip);
1435 memset(vip, 0, sizeof(*vip));
1436
1437 rw_init(&vip->vi_lock);
1438 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
1439
1440 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
1441 klist_init(&vip->vi_klist.vk_klist);
1442 vp->v_klist = &vip->vi_klist;
1443 cv_init(&vp->v_cv, "vnode");
1444 cache_vnode_init(vp);
1445
1446 vp->v_usecount = 1;
1447 vp->v_type = VNON;
1448 vp->v_size = vp->v_writesize = VSIZENOTSET;
1449
1450 vip->vi_state = VS_LOADING;
1451
1452 lru_requeue(vp, &lru_list[LRU_FREE]);
1453
1454 return vip;
1455 }
1456
1457 /*
1458 * Deallocate a vcache node in state VS_LOADING.
1459 *
1460 * vcache_lock held on entry and released on return.
1461 */
1462 static void
1463 vcache_dealloc(vnode_impl_t *vip)
1464 {
1465 vnode_t *vp;
1466
1467 KASSERT(mutex_owned(&vcache_lock));
1468
1469 vp = VIMPL_TO_VNODE(vip);
1470 vfs_ref(dead_rootmount);
1471 vfs_insmntque(vp, dead_rootmount);
1472 mutex_enter(vp->v_interlock);
1473 vp->v_op = dead_vnodeop_p;
1474 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1475 mutex_exit(&vcache_lock);
1476 vrelel(vp, 0, LK_NONE);
1477 }
1478
1479 /*
1480 * Free an unused, unreferenced vcache node.
1481 * v_interlock locked on entry.
1482 */
1483 static void
1484 vcache_free(vnode_impl_t *vip)
1485 {
1486 vnode_t *vp;
1487
1488 vp = VIMPL_TO_VNODE(vip);
1489 KASSERT(mutex_owned(vp->v_interlock));
1490
1491 KASSERT(vrefcnt(vp) == 0);
1492 KASSERT(vp->v_holdcnt == 0);
1493 KASSERT(vp->v_writecount == 0);
1494 lru_requeue(vp, NULL);
1495 mutex_exit(vp->v_interlock);
1496
1497 vfs_insmntque(vp, NULL);
1498 if (vp->v_type == VBLK || vp->v_type == VCHR)
1499 spec_node_destroy(vp);
1500
1501 mutex_obj_free(vp->v_interlock);
1502 rw_destroy(&vip->vi_lock);
1503 uvm_obj_destroy(&vp->v_uobj, true);
1504 KASSERT(vp->v_klist == &vip->vi_klist);
1505 klist_fini(&vip->vi_klist.vk_klist);
1506 cv_destroy(&vp->v_cv);
1507 cache_vnode_fini(vp);
1508 pool_cache_put(vcache_pool, vip);
1509 }
1510
1511 /*
1512 * Try to get an initial reference on this cached vnode.
1513 * Returns zero on success or EBUSY if the vnode state is not LOADED.
1514 *
1515 * NB: lockless code sequences may rely on this not blocking.
1516 */
1517 int
1518 vcache_tryvget(vnode_t *vp)
1519 {
1520 u_int use, next;
1521
1522 for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
1523 if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
1524 return EBUSY;
1525 }
1526 next = atomic_cas_uint(&vp->v_usecount,
1527 use, (use + 1) | VUSECOUNT_VGET);
1528 if (__predict_true(next == use)) {
1529 membar_acquire();
1530 return 0;
1531 }
1532 }
1533 }
1534
1535 /*
1536 * Try to get an initial reference on this cached vnode.
1537 * Returns zero on success or ENOENT if the vnode has been reclaimed.
1538 * Will wait for the vnode state to be stable.
1539 *
1540 * v_interlock locked on entry and unlocked on exit.
1541 */
1542 int
1543 vcache_vget(vnode_t *vp)
1544 {
1545 int error;
1546
1547 KASSERT(mutex_owned(vp->v_interlock));
1548
1549 /* Increment hold count to prevent vnode from disappearing. */
1550 vp->v_holdcnt++;
1551 VSTATE_WAIT_STABLE(vp);
1552 vp->v_holdcnt--;
1553
1554 /* If this was the last reference to a reclaimed vnode free it now. */
1555 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1556 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
1557 vcache_free(VNODE_TO_VIMPL(vp));
1558 else
1559 mutex_exit(vp->v_interlock);
1560 return ENOENT;
1561 }
1562 VSTATE_ASSERT(vp, VS_LOADED);
1563 error = vcache_tryvget(vp);
1564 KASSERT(error == 0);
1565 mutex_exit(vp->v_interlock);
1566
1567 return 0;
1568 }
1569
1570 /*
1571 * Get a vnode / fs node pair by key and return it referenced through vpp.
1572 */
1573 int
1574 vcache_get(struct mount *mp, const void *key, size_t key_len,
1575 struct vnode **vpp)
1576 {
1577 int error;
1578 uint32_t hash;
1579 const void *new_key;
1580 struct vnode *vp;
1581 struct vcache_key vcache_key;
1582 vnode_impl_t *vip, *new_vip;
1583
1584 new_key = NULL;
1585 *vpp = NULL;
1586
1587 vcache_key.vk_mount = mp;
1588 vcache_key.vk_key = key;
1589 vcache_key.vk_key_len = key_len;
1590 hash = vcache_hash(&vcache_key);
1591
1592 again:
1593 mutex_enter(&vcache_lock);
1594 vip = vcache_hash_lookup(&vcache_key, hash);
1595
1596 /* If found, take a reference or retry. */
1597 if (__predict_true(vip != NULL)) {
1598 /*
1599 * If the vnode is loading we cannot take the v_interlock
1600 * here as it might change during load (see uvm_obj_setlock()).
1601 * As changing state from VS_LOADING requires both vcache_lock
1602 * and v_interlock it is safe to test with vcache_lock held.
1603 *
1604 * Wait for vnodes changing state from VS_LOADING and retry.
1605 */
1606 if (__predict_false(vip->vi_state == VS_LOADING)) {
1607 cv_wait(&vcache_cv, &vcache_lock);
1608 mutex_exit(&vcache_lock);
1609 goto again;
1610 }
1611 vp = VIMPL_TO_VNODE(vip);
1612 mutex_enter(vp->v_interlock);
1613 mutex_exit(&vcache_lock);
1614 error = vcache_vget(vp);
1615 if (error == ENOENT)
1616 goto again;
1617 if (error == 0)
1618 *vpp = vp;
1619 KASSERT((error != 0) == (*vpp == NULL));
1620 return error;
1621 }
1622 mutex_exit(&vcache_lock);
1623
1624 /* Allocate and initialize a new vcache / vnode pair. */
1625 error = vfs_busy(mp);
1626 if (error)
1627 return error;
1628 new_vip = vcache_alloc();
1629 new_vip->vi_key = vcache_key;
1630 vp = VIMPL_TO_VNODE(new_vip);
1631 mutex_enter(&vcache_lock);
1632 vip = vcache_hash_lookup(&vcache_key, hash);
1633 if (vip == NULL) {
1634 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1635 new_vip, vi_hash);
1636 vip = new_vip;
1637 }
1638
1639 /* If another thread beat us inserting this node, retry. */
1640 if (vip != new_vip) {
1641 vcache_dealloc(new_vip);
1642 vfs_unbusy(mp);
1643 goto again;
1644 }
1645 mutex_exit(&vcache_lock);
1646
1647 /* Load the fs node. Exclusive as new_vip is VS_LOADING. */
1648 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1649 if (error) {
1650 mutex_enter(&vcache_lock);
1651 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1652 new_vip, vnode_impl, vi_hash);
1653 vcache_dealloc(new_vip);
1654 vfs_unbusy(mp);
1655 KASSERT(*vpp == NULL);
1656 return error;
1657 }
1658 KASSERT(new_key != NULL);
1659 KASSERT(memcmp(key, new_key, key_len) == 0);
1660 KASSERT(vp->v_op != NULL);
1661 vfs_insmntque(vp, mp);
1662 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1663 vp->v_vflag |= VV_MPSAFE;
1664 vfs_ref(mp);
1665 vfs_unbusy(mp);
1666
1667 /* Finished loading, finalize node. */
1668 mutex_enter(&vcache_lock);
1669 new_vip->vi_key.vk_key = new_key;
1670 mutex_enter(vp->v_interlock);
1671 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1672 mutex_exit(vp->v_interlock);
1673 mutex_exit(&vcache_lock);
1674 *vpp = vp;
1675 return 0;
1676 }
1677
1678 /*
1679 * Create a new vnode / fs node pair and return it referenced through vpp.
1680 */
1681 int
1682 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1683 kauth_cred_t cred, void *extra, struct vnode **vpp)
1684 {
1685 int error;
1686 uint32_t hash;
1687 struct vnode *vp, *ovp;
1688 vnode_impl_t *vip, *ovip;
1689
1690 *vpp = NULL;
1691
1692 /* Allocate and initialize a new vcache / vnode pair. */
1693 error = vfs_busy(mp);
1694 if (error)
1695 return error;
1696 vip = vcache_alloc();
1697 vip->vi_key.vk_mount = mp;
1698 vp = VIMPL_TO_VNODE(vip);
1699
1700 /* Create and load the fs node. */
1701 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1702 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1703 if (error) {
1704 mutex_enter(&vcache_lock);
1705 vcache_dealloc(vip);
1706 vfs_unbusy(mp);
1707 KASSERT(*vpp == NULL);
1708 return error;
1709 }
1710 KASSERT(vp->v_op != NULL);
1711 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1712 if (vip->vi_key.vk_key_len > 0) {
1713 KASSERT(vip->vi_key.vk_key != NULL);
1714 hash = vcache_hash(&vip->vi_key);
1715
1716 /*
1717 * Wait for previous instance to be reclaimed,
1718 * then insert new node.
1719 */
1720 mutex_enter(&vcache_lock);
1721 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1722 ovp = VIMPL_TO_VNODE(ovip);
1723 mutex_enter(ovp->v_interlock);
1724 mutex_exit(&vcache_lock);
1725 error = vcache_vget(ovp);
1726 KASSERT(error == ENOENT);
1727 mutex_enter(&vcache_lock);
1728 }
1729 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1730 vip, vi_hash);
1731 mutex_exit(&vcache_lock);
1732 }
1733 vfs_insmntque(vp, mp);
1734 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1735 vp->v_vflag |= VV_MPSAFE;
1736 vfs_ref(mp);
1737 vfs_unbusy(mp);
1738
1739 /* Finished loading, finalize node. */
1740 mutex_enter(&vcache_lock);
1741 mutex_enter(vp->v_interlock);
1742 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1743 mutex_exit(&vcache_lock);
1744 mutex_exit(vp->v_interlock);
1745 *vpp = vp;
1746 return 0;
1747 }
1748
1749 /*
1750 * Prepare key change: update the old cache node's key and lock the new cache node.
1751 * Return an error if the new node already exists.
1752 */
1753 int
1754 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1755 const void *old_key, size_t old_key_len,
1756 const void *new_key, size_t new_key_len)
1757 {
1758 uint32_t old_hash, new_hash;
1759 struct vcache_key old_vcache_key, new_vcache_key;
1760 vnode_impl_t *vip, *new_vip;
1761
1762 old_vcache_key.vk_mount = mp;
1763 old_vcache_key.vk_key = old_key;
1764 old_vcache_key.vk_key_len = old_key_len;
1765 old_hash = vcache_hash(&old_vcache_key);
1766
1767 new_vcache_key.vk_mount = mp;
1768 new_vcache_key.vk_key = new_key;
1769 new_vcache_key.vk_key_len = new_key_len;
1770 new_hash = vcache_hash(&new_vcache_key);
1771
1772 new_vip = vcache_alloc();
1773 new_vip->vi_key = new_vcache_key;
1774
1775 /* Insert locked new node used as placeholder. */
1776 mutex_enter(&vcache_lock);
1777 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1778 if (vip != NULL) {
1779 vcache_dealloc(new_vip);
1780 return EEXIST;
1781 }
1782 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1783 new_vip, vi_hash);
1784
1785 /* Replace the old node's key with the temporary copy. */
1786 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1787 KASSERT(vip != NULL);
1788 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1789 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1790 vip->vi_key = old_vcache_key;
1791 mutex_exit(&vcache_lock);
1792 return 0;
1793 }
1794
1795 /*
1796 * Key change complete: update old node and remove placeholder.
1797 */
1798 void
1799 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1800 const void *old_key, size_t old_key_len,
1801 const void *new_key, size_t new_key_len)
1802 {
1803 uint32_t old_hash, new_hash;
1804 struct vcache_key old_vcache_key, new_vcache_key;
1805 vnode_impl_t *vip, *new_vip;
1806 struct vnode *new_vp;
1807
1808 old_vcache_key.vk_mount = mp;
1809 old_vcache_key.vk_key = old_key;
1810 old_vcache_key.vk_key_len = old_key_len;
1811 old_hash = vcache_hash(&old_vcache_key);
1812
1813 new_vcache_key.vk_mount = mp;
1814 new_vcache_key.vk_key = new_key;
1815 new_vcache_key.vk_key_len = new_key_len;
1816 new_hash = vcache_hash(&new_vcache_key);
1817
1818 mutex_enter(&vcache_lock);
1819
1820 /* Lookup old and new node. */
1821 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1822 KASSERT(vip != NULL);
1823 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1824
1825 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1826 KASSERT(new_vip != NULL);
1827 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1828 new_vp = VIMPL_TO_VNODE(new_vip);
1829 mutex_enter(new_vp->v_interlock);
1830 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1831 mutex_exit(new_vp->v_interlock);
1832
1833 /* Rekey old node and put it onto its new hashlist. */
1834 vip->vi_key = new_vcache_key;
1835 if (old_hash != new_hash) {
1836 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1837 vip, vnode_impl, vi_hash);
1838 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1839 vip, vi_hash);
1840 }
1841
1842 /* Remove new node used as placeholder. */
1843 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1844 new_vip, vnode_impl, vi_hash);
1845 vcache_dealloc(new_vip);
1846 }
1847
1848 /*
1849 * Disassociate the underlying file system from a vnode.
1850 *
1851 * Must be called with vnode locked and will return unlocked.
1852 * Must be called with the interlock held, and will return with it held.
1853 */
1854 static void
1855 vcache_reclaim(vnode_t *vp)
1856 {
1857 lwp_t *l = curlwp;
1858 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1859 struct mount *mp = vp->v_mount;
1860 uint32_t hash;
1861 uint8_t temp_buf[64], *temp_key;
1862 size_t temp_key_len;
1863 bool recycle;
1864 int error;
1865
1866 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1867 KASSERT(mutex_owned(vp->v_interlock));
1868 KASSERT(vrefcnt(vp) != 0);
1869
1870 temp_key_len = vip->vi_key.vk_key_len;
1871 /*
1872 * Prevent the vnode from being recycled or brought into use
1873 * while we clean it out.
1874 */
1875 VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
1876
1877 /*
1878 * Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
1879 * because VOP_RECLAIM() could cause vp->v_klist to
1880 * become invalid. Don't check for interest in NOTE_REVOKE
1881 * here; it's always posted because it sets EV_EOF.
1882 *
1883 * Once it's been posted, reset vp->v_klist to point to
1884 * our own local storage, in case we were sharing with
1885 * someone else.
1886 */
1887 KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
1888 vp->v_klist = &vip->vi_klist;
1889 mutex_exit(vp->v_interlock);
1890
1891 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1892 mutex_enter(vp->v_interlock);
1893 if ((vp->v_iflag & VI_EXECMAP) != 0) {
1894 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
1895 }
1896 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1897 vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
1898 mutex_exit(vp->v_interlock);
1899 rw_exit(vp->v_uobj.vmobjlock);
1900
1901 /*
1902 * With vnode state set to reclaiming, purge name cache immediately
1903 * to prevent new handles on vnode, and wait for existing threads
1904 * trying to get a handle to notice VS_RECLAIMED status and abort.
1905 */
1906 cache_purge(vp);
1907
1908 /* Replace the vnode key with a temporary copy. */
1909 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1910 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1911 } else {
1912 temp_key = temp_buf;
1913 }
1914 if (vip->vi_key.vk_key_len > 0) {
1915 mutex_enter(&vcache_lock);
1916 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1917 vip->vi_key.vk_key = temp_key;
1918 mutex_exit(&vcache_lock);
1919 }
1920
1921 fstrans_start(mp);
1922
1923 /*
1924 * Clean out any cached data associated with the vnode.
1925 */
1926 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1927 if (error != 0) {
1928 if (wapbl_vphaswapbl(vp))
1929 WAPBL_DISCARD(wapbl_vptomp(vp));
1930 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1931 }
1932 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1933 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1934 if (vp->v_type == VBLK || vp->v_type == VCHR) {
1935 spec_node_revoke(vp);
1936 }
1937
1938 /*
1939 * Disassociate the underlying file system from the vnode.
1940 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1941 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1942 * would no longer function.
1943 */
1944 VOP_INACTIVE(vp, &recycle);
1945 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1946 if (VOP_RECLAIM(vp)) {
1947 vnpanic(vp, "%s: cannot reclaim", __func__);
1948 }
1949
1950 KASSERT(vp->v_data == NULL);
1951 KASSERT((vp->v_iflag & VI_PAGES) == 0);
1952
1953 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1954 uvm_ra_freectx(vp->v_ractx);
1955 vp->v_ractx = NULL;
1956 }
1957
1958 if (vip->vi_key.vk_key_len > 0) {
1959 /* Remove from vnode cache. */
1960 hash = vcache_hash(&vip->vi_key);
1961 mutex_enter(&vcache_lock);
1962 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1963 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1964 vip, vnode_impl, vi_hash);
1965 mutex_exit(&vcache_lock);
1966 }
1967 if (temp_key != temp_buf)
1968 kmem_free(temp_key, temp_key_len);
1969
1970 /* Done with purge, notify sleepers of the grim news. */
1971 mutex_enter(vp->v_interlock);
1972 vp->v_op = dead_vnodeop_p;
1973 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1974 vp->v_tag = VT_NON;
1975 mutex_exit(vp->v_interlock);
1976
1977 /*
1978 * Move to dead mount. Must be after changing the operations
1979 * vector as vnode operations enter the mount before using the
1980 * operations vector. See sys/kern/vnode_if.c.
1981 */
1982 vp->v_vflag &= ~VV_ROOT;
1983 vfs_ref(dead_rootmount);
1984 vfs_insmntque(vp, dead_rootmount);
1985
1986 #ifdef PAX_SEGVGUARD
1987 pax_segvguard_cleanup(vp);
1988 #endif /* PAX_SEGVGUARD */
1989
1990 mutex_enter(vp->v_interlock);
1991 fstrans_done(mp);
1992 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1993 }
1994
1995 /*
1996 * Disassociate the underlying file system from an open device vnode
1997 * and make it anonymous.
1998 *
1999 * Vnode unlocked on entry, drops a reference to the vnode.
2000 */
2001 void
2002 vcache_make_anon(vnode_t *vp)
2003 {
2004 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
2005 uint32_t hash;
2006 bool recycle;
2007
2008 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
2009 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
2010 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
2011
2012 /* Remove from vnode cache. */
2013 hash = vcache_hash(&vip->vi_key);
2014 mutex_enter(&vcache_lock);
2015 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
2016 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
2017 vip, vnode_impl, vi_hash);
2018 vip->vi_key.vk_mount = dead_rootmount;
2019 vip->vi_key.vk_key_len = 0;
2020 vip->vi_key.vk_key = NULL;
2021 mutex_exit(&vcache_lock);
2022
2023 /*
2024 * Disassociate the underlying file system from the vnode.
2025 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
2026 * the vnode, and may destroy the vnode so that VOP_UNLOCK
2027 * would no longer function.
2028 */
2029 if (vn_lock(vp, LK_EXCLUSIVE)) {
2030 vnpanic(vp, "%s: cannot lock", __func__);
2031 }
2032 VOP_INACTIVE(vp, &recycle);
2033 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2034 if (VOP_RECLAIM(vp)) {
2035 vnpanic(vp, "%s: cannot reclaim", __func__);
2036 }
2037
2038 /* Purge name cache. */
2039 cache_purge(vp);
2040
2041 /* Done with purge, change operations vector. */
2042 mutex_enter(vp->v_interlock);
2043 vp->v_op = spec_vnodeop_p;
2044 vp->v_vflag |= VV_MPSAFE;
2045 mutex_exit(vp->v_interlock);
2046
2047 /*
2048 * Move to dead mount. Must be after changing the operations
2049 * vector as vnode operations enter the mount before using the
2050 * operations vector. See sys/kern/vnode_if.c.
2051 */
2052 vfs_ref(dead_rootmount);
2053 vfs_insmntque(vp, dead_rootmount);
2054
2055 vrele(vp);
2056 }
2057
2058 /*
2059 * Update outstanding I/O count and do wakeup if requested.
2060 */
2061 void
2062 vwakeup(struct buf *bp)
2063 {
2064 vnode_t *vp;
2065
2066 if ((vp = bp->b_vp) == NULL)
2067 return;
2068
2069 KASSERT(bp->b_objlock == vp->v_interlock);
2070 KASSERT(mutex_owned(bp->b_objlock));
2071
2072 if (--vp->v_numoutput < 0)
2073 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
2074 if (vp->v_numoutput == 0)
2075 cv_broadcast(&vp->v_cv);
2076 }
2077
2078 /*
2079 * Test a vnode for being or becoming dead. Returns one of:
2080 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
2081 * ENOENT: vnode is dead.
2082 * 0: otherwise.
2083 *
2084 * Whenever this function returns a non-zero value all future
2085 * calls will also return a non-zero value.
2086 */
2087 int
2088 vdead_check(struct vnode *vp, int flags)
2089 {
2090
2091 KASSERT(mutex_owned(vp->v_interlock));
2092
2093 if (! ISSET(flags, VDEAD_NOWAIT))
2094 VSTATE_WAIT_STABLE(vp);
2095
2096 if (VSTATE_GET(vp) == VS_RECLAIMING) {
2097 KASSERT(ISSET(flags, VDEAD_NOWAIT));
2098 return EBUSY;
2099 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
2100 return ENOENT;
2101 }
2102
2103 return 0;
2104 }
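/*
 * Illustrative sketch (an editorial addition, not part of the original
 * source): callers hold v_interlock across vdead_check(), e.g.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error == ENOENT)
 *		...				(vnode is dead, give up)
 *	else if (error == EBUSY)
 *		...				(becoming dead, may retry later)
 */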
2105
2106 int
2107 vfs_drainvnodes(void)
2108 {
2109
2110 mutex_enter(&vdrain_lock);
2111
2112 if (!vdrain_one(desiredvnodes)) {
2113 mutex_exit(&vdrain_lock);
2114 return EBUSY;
2115 }
2116
2117 mutex_exit(&vdrain_lock);
2118
2119 if (vcache_hashsize != desiredvnodes)
2120 vcache_reinit();
2121
2122 return 0;
2123 }
2124
2125 void
2126 vnpanic(vnode_t *vp, const char *fmt, ...)
2127 {
2128 va_list ap;
2129
2130 #ifdef DIAGNOSTIC
2131 vprint(NULL, vp);
2132 #endif
2133 va_start(ap, fmt);
2134 vpanic(fmt, ap);
2135 va_end(ap);
2136 }
2137
2138 void
2139 vshareilock(vnode_t *tvp, vnode_t *fvp)
2140 {
2141 kmutex_t *oldlock;
2142
2143 oldlock = tvp->v_interlock;
2144 mutex_obj_hold(fvp->v_interlock);
2145 tvp->v_interlock = fvp->v_interlock;
2146 mutex_obj_free(oldlock);
2147 }
2148
2149 void
2150 vshareklist(vnode_t *tvp, vnode_t *fvp)
2151 {
2152 /*
2153 * If two vnodes share klist state, they must also share
2154 * an interlock.
2155 */
2156 KASSERT(tvp->v_interlock == fvp->v_interlock);
2157
2158 /*
2159 * We make the following assumptions:
2160 *
2161 * ==> Some other synchronization is happening outside of
2162 * our view to make this safe.
2163 *
2164 * ==> That the "to" vnode will have the necessary references
2165 * on the "from" vnode so that the storage for the klist
2166 * won't be yanked out from beneath us (the vnode_impl).
2167 *
2168 * ==> If "from" is also sharing, we then assume that "from"
2169 * has the necessary references, and so on.
2170 */
2171 tvp->v_klist = fvp->v_klist;
2172 }
2173