     1 /* $NetBSD: vfs_vnode.c,v 1.111 2020/02/23 15:46:41 ad Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of inactive vnode, via vcache_vget(9).
80 *
81 * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
82 * was another, traditional way. Currently, only the draining thread
83 * recycles the vnodes. This behaviour might be revisited.
84 *
85 * The life-cycle ends when the last reference is dropped, usually
    86 	 * in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
    87 	 * the file system that the vnode is inactive. Via this call, the file
    88 	 * system indicates whether the vnode can be recycled (usually, it checks
    89 	 * its own references, e.g. the link count, or whether the file was removed).
    90 	 *
    91 	 * Depending on that indication, the vnode can be put onto a free list
    92 	 * (cache), or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
    93 	 * disassociate the underlying file system from the vnode, and finally
    94 	 * destroyed.
95 *
96 * Vnode state
97 *
98 * Vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
   101 	 * - LOADING	Vnode is associating with the underlying file system
   102 	 *		and is not yet ready to use.
   103 	 * - LOADED	Vnode has an associated underlying file system and is
   104 	 *		ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from underlying file system
109 * and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * LOADED -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system or vcache_rekey*()
129 * drops a vnode used as placeholder.
130 *
131 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate
132 * and it is possible to wait for state change.
133 *
134 * State is protected with v_interlock with one exception:
135 * to change from LOADING both v_interlock and vcache_lock must be held
136 * so it is possible to check "state == LOADING" without holding
137 * v_interlock. See vcache_get() for details.
138 *
139 * Reference counting
140 *
   141 	 * A vnode is considered active if its reference count (vnode_t::v_usecount)
   142 	 * is non-zero. The count is maintained using the vref(9), vrele(9) and
   143 	 * vput(9) routines. Common points holding references are, e.g.,
   144 	 * open files, current working directories, mount points, etc.
145 *
146 */
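
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): how a file system routine typically drives this life-cycle,
 * assuming an inode number is used as the cache key.  The function name
 * example_read_ino() is hypothetical; the vnode interfaces used are the
 * ones implemented below and documented in vnode(9).
 *
 *	static int
 *	example_read_ino(struct mount *mp, ino_t ino)
 *	{
 *		struct vnode *vp;
 *		int error;
 *
 *		error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *		if (error != 0)
 *			return error;
 *		// vp is now referenced and LOADED, but not locked.
 *
 *		error = vn_lock(vp, LK_SHARED);
 *		if (error != 0) {
 *			// The vnode was reclaimed while we slept.
 *			vrele(vp);
 *			return error;
 *		}
 *		// ... operate on the vnode, e.g. VOP_READ() ...
 *
 *		// Unlock and drop the reference.  If this was the last
 *		// reference, vput() ends up in VOP_INACTIVE() and the
 *		// vnode is either cached on an LRU list or reclaimed.
 *		vput(vp);
 *		return 0;
 *	}
 */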
147
148 #include <sys/cdefs.h>
149 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.111 2020/02/23 15:46:41 ad Exp $");
150
151 #ifdef _KERNEL_OPT
152 #include "opt_pax.h"
153 #endif
154
155 #include <sys/param.h>
156 #include <sys/kernel.h>
157
158 #include <sys/atomic.h>
159 #include <sys/buf.h>
160 #include <sys/conf.h>
161 #include <sys/device.h>
162 #include <sys/hash.h>
163 #include <sys/kauth.h>
164 #include <sys/kmem.h>
165 #include <sys/kthread.h>
166 #include <sys/module.h>
167 #include <sys/mount.h>
168 #include <sys/namei.h>
169 #include <sys/pax.h>
170 #include <sys/syscallargs.h>
171 #include <sys/sysctl.h>
172 #include <sys/systm.h>
173 #include <sys/vnode_impl.h>
174 #include <sys/wapbl.h>
175 #include <sys/fstrans.h>
176
177 #include <uvm/uvm.h>
178 #include <uvm/uvm_readahead.h>
179 #include <uvm/uvm_stat.h>
180
181 /* Flags to vrelel. */
182 #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
183
184 #define LRU_VRELE 0
185 #define LRU_FREE 1
186 #define LRU_HOLD 2
187 #define LRU_COUNT 3
188
189 /*
190 * There are three lru lists: one holds vnodes waiting for async release,
191 * one is for vnodes which have no buffer/page references and one for those
192 * which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
193 * private cache line as vnodes migrate between them while under the same
194 * lock (vdrain_lock).
195 */
196 u_int numvnodes __cacheline_aligned;
197 static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
198 static kmutex_t vdrain_lock __cacheline_aligned;
199 static kcondvar_t vdrain_cv;
200 static int vdrain_gen;
201 static kcondvar_t vdrain_gen_cv;
202 static bool vdrain_retry;
203 static lwp_t * vdrain_lwp;
204 SLIST_HEAD(hashhead, vnode_impl);
205 static kmutex_t vcache_lock __cacheline_aligned;
206 static kcondvar_t vcache_cv;
207 static u_int vcache_hashsize;
208 static u_long vcache_hashmask;
209 static struct hashhead *vcache_hashtab;
210 static pool_cache_t vcache_pool;
211 static void lru_requeue(vnode_t *, vnodelst_t *);
212 static vnodelst_t * lru_which(vnode_t *);
213 static vnode_impl_t * vcache_alloc(void);
214 static void vcache_dealloc(vnode_impl_t *);
215 static void vcache_free(vnode_impl_t *);
216 static void vcache_init(void);
217 static void vcache_reinit(void);
218 static void vcache_reclaim(vnode_t *);
219 static void vrelel(vnode_t *, int, int);
220 static void vdrain_thread(void *);
221 static void vnpanic(vnode_t *, const char *, ...)
222 __printflike(2, 3);
223
224 /* Routines having to do with the management of the vnode table. */
225 extern struct mount *dead_rootmount;
226 extern int (**dead_vnodeop_p)(void *);
227 extern int (**spec_vnodeop_p)(void *);
228 extern struct vfsops dead_vfsops;
229
230 /* Vnode state operations and diagnostics. */
231
232 #if defined(DIAGNOSTIC)
233
234 #define VSTATE_VALID(state) \
235 ((state) != VS_ACTIVE && (state) != VS_MARKER)
236 #define VSTATE_GET(vp) \
237 vstate_assert_get((vp), __func__, __LINE__)
238 #define VSTATE_CHANGE(vp, from, to) \
239 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
240 #define VSTATE_WAIT_STABLE(vp) \
241 vstate_assert_wait_stable((vp), __func__, __LINE__)
242
243 void
244 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
245 bool has_lock)
246 {
247 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
248
249 if (!has_lock) {
250 /*
251 * Prevent predictive loads from the CPU, but check the state
   252 		 * without locking first.
253 */
254 membar_enter();
255 if (state == VS_ACTIVE && vp->v_usecount > 0 &&
256 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED))
257 return;
258 if (vip->vi_state == state)
259 return;
260 mutex_enter((vp)->v_interlock);
261 }
262
263 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
264
265 if ((state == VS_ACTIVE && vp->v_usecount > 0 &&
266 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
267 vip->vi_state == state) {
268 if (!has_lock)
269 mutex_exit((vp)->v_interlock);
270 return;
271 }
272 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
273 vstate_name(vip->vi_state), vp->v_usecount,
274 vstate_name(state), func, line);
275 }
276
277 static enum vnode_state
278 vstate_assert_get(vnode_t *vp, const char *func, int line)
279 {
280 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
281
282 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
283 if (! VSTATE_VALID(vip->vi_state))
284 vnpanic(vp, "state is %s at %s:%d",
285 vstate_name(vip->vi_state), func, line);
286
287 return vip->vi_state;
288 }
289
290 static void
291 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
292 {
293 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
294
295 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
296 if (! VSTATE_VALID(vip->vi_state))
297 vnpanic(vp, "state is %s at %s:%d",
298 vstate_name(vip->vi_state), func, line);
299
300 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
301 cv_wait(&vp->v_cv, vp->v_interlock);
302
303 if (! VSTATE_VALID(vip->vi_state))
304 vnpanic(vp, "state is %s at %s:%d",
305 vstate_name(vip->vi_state), func, line);
306 }
307
308 static void
309 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
310 const char *func, int line)
311 {
312 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
313
314 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
315 if (from == VS_LOADING)
316 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
317
318 if (! VSTATE_VALID(from))
319 vnpanic(vp, "from is %s at %s:%d",
320 vstate_name(from), func, line);
321 if (! VSTATE_VALID(to))
322 vnpanic(vp, "to is %s at %s:%d",
323 vstate_name(to), func, line);
324 if (vip->vi_state != from)
325 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
326 vstate_name(vip->vi_state), vstate_name(from), func, line);
327 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1)
328 vnpanic(vp, "%s to %s with usecount %d at %s:%d",
329 vstate_name(from), vstate_name(to), vp->v_usecount,
330 func, line);
331
332 vip->vi_state = to;
333 if (from == VS_LOADING)
334 cv_broadcast(&vcache_cv);
335 if (to == VS_LOADED || to == VS_RECLAIMED)
336 cv_broadcast(&vp->v_cv);
337 }
338
339 #else /* defined(DIAGNOSTIC) */
340
341 #define VSTATE_GET(vp) \
342 (VNODE_TO_VIMPL((vp))->vi_state)
343 #define VSTATE_CHANGE(vp, from, to) \
344 vstate_change((vp), (from), (to))
345 #define VSTATE_WAIT_STABLE(vp) \
346 vstate_wait_stable((vp))
347 void
348 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
349 bool has_lock)
350 {
351
352 }
353
354 static void
355 vstate_wait_stable(vnode_t *vp)
356 {
357 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
358
359 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
360 cv_wait(&vp->v_cv, vp->v_interlock);
361 }
362
363 static void
364 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
365 {
366 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
367
368 vip->vi_state = to;
369 if (from == VS_LOADING)
370 cv_broadcast(&vcache_cv);
371 if (to == VS_LOADED || to == VS_RECLAIMED)
372 cv_broadcast(&vp->v_cv);
373 }
374
375 #endif /* defined(DIAGNOSTIC) */
376
377 void
378 vfs_vnode_sysinit(void)
379 {
380 int error __diagused, i;
381
382 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
383 KASSERT(dead_rootmount != NULL);
384 dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
385
386 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
387 for (i = 0; i < LRU_COUNT; i++) {
388 TAILQ_INIT(&lru_list[i]);
389 }
390 vcache_init();
391
392 cv_init(&vdrain_cv, "vdrain");
393 cv_init(&vdrain_gen_cv, "vdrainwt");
394 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
395 NULL, &vdrain_lwp, "vdrain");
396 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
397 }
398
399 /*
400 * Allocate a new marker vnode.
401 */
402 vnode_t *
403 vnalloc_marker(struct mount *mp)
404 {
405 vnode_impl_t *vip;
406 vnode_t *vp;
407
408 vip = pool_cache_get(vcache_pool, PR_WAITOK);
409 memset(vip, 0, sizeof(*vip));
410 vp = VIMPL_TO_VNODE(vip);
411 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
412 vp->v_mount = mp;
413 vp->v_type = VBAD;
414 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
415 vip->vi_state = VS_MARKER;
416
417 return vp;
418 }
419
420 /*
421 * Free a marker vnode.
422 */
423 void
424 vnfree_marker(vnode_t *vp)
425 {
426 vnode_impl_t *vip;
427
428 vip = VNODE_TO_VIMPL(vp);
429 KASSERT(vip->vi_state == VS_MARKER);
430 mutex_obj_free(vp->v_interlock);
431 uvm_obj_destroy(&vp->v_uobj, true);
432 pool_cache_put(vcache_pool, vip);
433 }
434
435 /*
436 * Test a vnode for being a marker vnode.
437 */
438 bool
439 vnis_marker(vnode_t *vp)
440 {
441
442 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
443 }
444
445 /*
446 * Return the lru list this node should be on.
447 */
448 static vnodelst_t *
449 lru_which(vnode_t *vp)
450 {
451
452 KASSERT(mutex_owned(vp->v_interlock));
453
454 if (vp->v_holdcnt > 0)
455 return &lru_list[LRU_HOLD];
456 else
457 return &lru_list[LRU_FREE];
458 }
459
460 /*
461 * Put vnode to end of given list.
462 * Both the current and the new list may be NULL, used on vnode alloc/free.
463 * Adjust numvnodes and signal vdrain thread if there is work.
464 */
465 static void
466 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
467 {
468 vnode_impl_t *vip;
469 int d;
470
471 /*
472 * If the vnode is on the correct list, and was put there recently,
473 * then leave it be, thus avoiding huge cache and lock contention.
474 */
475 vip = VNODE_TO_VIMPL(vp);
476 if (listhd == vip->vi_lrulisthd &&
477 (hardclock_ticks - vip->vi_lrulisttm) < hz) {
478 return;
479 }
480
481 mutex_enter(&vdrain_lock);
482 d = 0;
483 if (vip->vi_lrulisthd != NULL)
484 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
485 else
486 d++;
487 vip->vi_lrulisthd = listhd;
488 vip->vi_lrulisttm = hardclock_ticks;
489 if (vip->vi_lrulisthd != NULL)
490 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
491 else
492 d--;
493 if (d != 0) {
494 /*
495 * Looks strange? This is not a bug. Don't store
496 * numvnodes unless there is a change - avoid false
497 * sharing on MP.
498 */
499 numvnodes += d;
500 }
501 if (numvnodes > desiredvnodes || listhd == &lru_list[LRU_VRELE])
502 cv_broadcast(&vdrain_cv);
503 mutex_exit(&vdrain_lock);
504 }
505
506 /*
507 * Release deferred vrele vnodes for this mount.
508 * Called with file system suspended.
509 */
510 void
511 vrele_flush(struct mount *mp)
512 {
513 vnode_impl_t *vip, *marker;
514 vnode_t *vp;
515
516 KASSERT(fstrans_is_owner(mp));
517
518 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
519
520 mutex_enter(&vdrain_lock);
521 TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);
522
523 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
524 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
525 TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
526 vi_lrulist);
527 vp = VIMPL_TO_VNODE(vip);
528 if (vnis_marker(vp))
529 continue;
530
531 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
532 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
533 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
534 vip->vi_lrulisttm = hardclock_ticks;
535 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
536 mutex_exit(&vdrain_lock);
537
538 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
539 mutex_enter(vp->v_interlock);
540 vrelel(vp, 0, LK_EXCLUSIVE);
541
542 mutex_enter(&vdrain_lock);
543 }
544
545 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
546 mutex_exit(&vdrain_lock);
547
548 vnfree_marker(VIMPL_TO_VNODE(marker));
549 }
550
551 /*
552 * Reclaim a cached vnode. Used from vdrain_thread only.
553 */
554 static __inline void
555 vdrain_remove(vnode_t *vp)
556 {
557 struct mount *mp;
558
559 KASSERT(mutex_owned(&vdrain_lock));
560
561 /* Probe usecount (unlocked). */
562 if (vp->v_usecount > 0)
563 return;
564 /* Try v_interlock -- we lock the wrong direction! */
565 if (!mutex_tryenter(vp->v_interlock))
566 return;
567 /* Probe usecount and state. */
568 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_LOADED) {
569 mutex_exit(vp->v_interlock);
570 return;
571 }
572 mp = vp->v_mount;
573 if (fstrans_start_nowait(mp) != 0) {
574 mutex_exit(vp->v_interlock);
575 return;
576 }
577 vdrain_retry = true;
578 mutex_exit(&vdrain_lock);
579
580 if (vcache_vget(vp) == 0) {
581 if (!vrecycle(vp)) {
582 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
583 mutex_enter(vp->v_interlock);
584 vrelel(vp, 0, LK_EXCLUSIVE);
585 }
586 }
587 fstrans_done(mp);
588
589 mutex_enter(&vdrain_lock);
590 }
591
592 /*
593 * Release a cached vnode. Used from vdrain_thread only.
594 */
595 static __inline void
596 vdrain_vrele(vnode_t *vp)
597 {
598 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
599 struct mount *mp;
600
601 KASSERT(mutex_owned(&vdrain_lock));
602
603 mp = vp->v_mount;
604 if (fstrans_start_nowait(mp) != 0)
605 return;
606
607 /*
608 * First remove the vnode from the vrele list.
   609 	 * Put it on the LRU_HOLD list; the last vrele()
610 * will put it back onto the right list before
611 * its v_usecount reaches zero.
612 */
613 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
614 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
615 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
616 vip->vi_lrulisttm = hardclock_ticks;
617 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
618
619 vdrain_retry = true;
620 mutex_exit(&vdrain_lock);
621
622 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
623 mutex_enter(vp->v_interlock);
624 vrelel(vp, 0, LK_EXCLUSIVE);
625 fstrans_done(mp);
626
627 mutex_enter(&vdrain_lock);
628 }
629
630 /*
631 * Helper thread to keep the number of vnodes below desiredvnodes
632 * and release vnodes from asynchronous vrele.
633 */
634 static void
635 vdrain_thread(void *cookie)
636 {
637 int i;
638 u_int target;
639 vnode_impl_t *vip, *marker;
640
641 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
642
643 mutex_enter(&vdrain_lock);
644
645 for (;;) {
646 vdrain_retry = false;
647 target = desiredvnodes - desiredvnodes/10;
648
649 for (i = 0; i < LRU_COUNT; i++) {
650 TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
651 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
652 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
653 TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,
654 vi_lrulist);
655 if (vnis_marker(VIMPL_TO_VNODE(vip)))
656 continue;
657 if (i == LRU_VRELE)
658 vdrain_vrele(VIMPL_TO_VNODE(vip));
659 else if (numvnodes < target)
660 break;
661 else
662 vdrain_remove(VIMPL_TO_VNODE(vip));
663 }
664 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
665 }
666
667 if (vdrain_retry) {
668 mutex_exit(&vdrain_lock);
669 yield();
670 mutex_enter(&vdrain_lock);
671 } else {
672 vdrain_gen++;
673 cv_broadcast(&vdrain_gen_cv);
674 cv_wait(&vdrain_cv, &vdrain_lock);
675 }
676 }
677 }
678
679 /*
680 * vput: unlock and release the reference.
681 */
682 void
683 vput(vnode_t *vp)
684 {
685 int lktype;
686
687 if ((vp->v_vflag & VV_LOCKSWORK) == 0) {
688 lktype = LK_EXCLUSIVE;
689 } else {
690 lktype = VOP_ISLOCKED(vp);
691 KASSERT(lktype != LK_NONE);
692 }
693 mutex_enter(vp->v_interlock);
694 vrelel(vp, 0, lktype);
695 }
696
697 /*
698 * Vnode release. If reference count drops to zero, call inactive
699 * routine and either return to freelist or free to the pool.
700 */
701 static void
702 vrelel(vnode_t *vp, int flags, int lktype)
703 {
704 const bool async = ((flags & VRELEL_ASYNC) != 0);
705 bool recycle, defer;
706 int error;
707
708 KASSERT(mutex_owned(vp->v_interlock));
709
710 if (__predict_false(vp->v_op == dead_vnodeop_p &&
711 VSTATE_GET(vp) != VS_RECLAIMED)) {
712 vnpanic(vp, "dead but not clean");
713 }
714
715 /*
716 * If not the last reference, just drop the reference count
717 * and unlock.
718 */
719 if (vp->v_usecount > 1) {
720 if (lktype != LK_NONE) {
721 VOP_UNLOCK(vp);
722 }
723 vp->v_usecount--;
724 mutex_exit(vp->v_interlock);
725 return;
726 }
727 if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
728 vnpanic(vp, "%s: bad ref count", __func__);
729 }
730
731 #ifdef DIAGNOSTIC
732 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
733 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
734 vprint("vrelel: missing VOP_CLOSE()", vp);
735 }
736 #endif
737
738 /*
739 * First try to get the vnode locked for VOP_INACTIVE().
   740 	 * Defer vnode release to vdrain_thread if the caller requests it
   741 	 * explicitly, is the pagedaemon, or if taking the lock failed.
742 */
743 defer = false;
744 if ((curlwp == uvm.pagedaemon_lwp) || async) {
745 defer = true;
746 } else if (lktype == LK_SHARED) {
   747 		/* Excellent chance of getting the lock, if this is the last ref. */
748 error = vn_lock(vp, LK_UPGRADE | LK_RETRY |
749 LK_NOWAIT);
750 if (error != 0) {
751 defer = true;
752 } else {
753 lktype = LK_EXCLUSIVE;
754 }
755 } else if (lktype == LK_NONE) {
   756 		/* Excellent chance of getting the lock, if this is the last ref. */
757 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY |
758 LK_NOWAIT);
759 if (error != 0) {
760 defer = true;
761 } else {
762 lktype = LK_EXCLUSIVE;
763 }
764 }
765 KASSERT(mutex_owned(vp->v_interlock));
766 if (defer) {
767 /*
768 * Defer reclaim to the kthread; it's not safe to
769 * clean it here. We donate it our last reference.
770 */
771 if (lktype != LK_NONE) {
772 VOP_UNLOCK(vp);
773 }
774 lru_requeue(vp, &lru_list[LRU_VRELE]);
775 mutex_exit(vp->v_interlock);
776 return;
777 }
778 KASSERT(lktype == LK_EXCLUSIVE);
779
780 /*
781 * If not clean, deactivate the vnode, but preserve
782 * our reference across the call to VOP_INACTIVE().
783 */
784 if (VSTATE_GET(vp) == VS_RECLAIMED) {
785 VOP_UNLOCK(vp);
786 } else {
787 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
788 mutex_exit(vp->v_interlock);
789
790 /*
791 * The vnode must not gain another reference while being
792 * deactivated. If VOP_INACTIVE() indicates that
793 * the described file has been deleted, then recycle
794 * the vnode.
795 *
796 * Note that VOP_INACTIVE() will not drop the vnode lock.
797 */
798 recycle = false;
799 VOP_INACTIVE(vp, &recycle);
800 if (!recycle)
801 VOP_UNLOCK(vp);
802 mutex_enter(vp->v_interlock);
803 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
804 if (!recycle) {
805 if (vp->v_usecount > 1) {
806 vp->v_usecount--;
807 mutex_exit(vp->v_interlock);
808 return;
809 }
810 }
811
812 /*
813 * Take care of space accounting. We hold the last ref so
814 * it's OK to update VM related fields in v_iflag without
815 * holding vmobjlock: nobody else will be looking at them.
816 */
817 if ((vp->v_iflag & VI_EXECMAP) != 0 &&
818 vp->v_uobj.uo_npages != 0) {
819 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
820 cpu_count(CPU_COUNT_FILEPAGES, vp->v_uobj.uo_npages);
821 }
822 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
823 vp->v_vflag &= ~VV_MAPPED;
824
825 /*
826 * Recycle the vnode if the file is now unused (unlinked),
827 * otherwise just free it.
828 */
829 if (recycle) {
830 VSTATE_ASSERT(vp, VS_LOADED);
831 /* vcache_reclaim drops the lock. */
832 vcache_reclaim(vp);
833 }
834 KASSERT(vp->v_usecount > 0);
835 }
836
837 vp->v_usecount--;
838 if (vp->v_usecount != 0) {
839 /* Gained another reference while being reclaimed. */
840 mutex_exit(vp->v_interlock);
841 return;
842 }
843
844 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
845 /*
846 * It's clean so destroy it. It isn't referenced
847 * anywhere since it has been reclaimed.
848 */
849 vcache_free(VNODE_TO_VIMPL(vp));
850 } else {
851 /*
852 * Otherwise, put it back onto the freelist. It
853 * can't be destroyed while still associated with
854 * a file system.
855 */
856 lru_requeue(vp, lru_which(vp));
857 mutex_exit(vp->v_interlock);
858 }
859 }
860
861 void
862 vrele(vnode_t *vp)
863 {
864
865 mutex_enter(vp->v_interlock);
866 vrelel(vp, 0, LK_NONE);
867 }
868
869 /*
   870  * Asynchronous vnode release: the vnode is released in a different context.
871 */
872 void
873 vrele_async(vnode_t *vp)
874 {
875
876 mutex_enter(vp->v_interlock);
877 vrelel(vp, VRELEL_ASYNC, LK_NONE);
878 }
879
880 /*
881 * Vnode reference, where a reference is already held by some other
882 * object (for example, a file structure).
883 */
884 void
885 vref(vnode_t *vp)
886 {
887
888 KASSERT(vp->v_usecount != 0);
889
890 mutex_enter(vp->v_interlock);
891 vp->v_usecount++;
892 mutex_exit(vp->v_interlock);
893 }
894
895 /*
896 * Page or buffer structure gets a reference.
897 * Called with v_interlock held.
898 */
899 void
900 vholdl(vnode_t *vp)
901 {
902
903 KASSERT(mutex_owned(vp->v_interlock));
904
905 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0)
906 lru_requeue(vp, lru_which(vp));
907 }
908
909 /*
910 * Page or buffer structure frees a reference.
911 * Called with v_interlock held.
912 */
913 void
914 holdrelel(vnode_t *vp)
915 {
916
917 KASSERT(mutex_owned(vp->v_interlock));
918
919 if (vp->v_holdcnt <= 0) {
920 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
921 }
922
923 vp->v_holdcnt--;
924 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
925 lru_requeue(vp, lru_which(vp));
926 }
927
928 /*
929 * Recycle an unused vnode if caller holds the last reference.
930 */
931 bool
932 vrecycle(vnode_t *vp)
933 {
934 int error __diagused;
935
936 mutex_enter(vp->v_interlock);
937
938 /* Make sure we hold the last reference. */
939 VSTATE_WAIT_STABLE(vp);
940 if (vp->v_usecount != 1) {
941 mutex_exit(vp->v_interlock);
942 return false;
943 }
944
945 /* If the vnode is already clean we're done. */
946 if (VSTATE_GET(vp) != VS_LOADED) {
947 VSTATE_ASSERT(vp, VS_RECLAIMED);
948 vrelel(vp, 0, LK_NONE);
949 return true;
950 }
951
952 /* Prevent further references until the vnode is locked. */
953 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
954 mutex_exit(vp->v_interlock);
955
956 /*
957 * On a leaf file system this lock will always succeed as we hold
958 * the last reference and prevent further references.
959 * On layered file systems waiting for the lock would open a can of
960 * deadlocks as the lower vnodes may have other active references.
961 */
962 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
963
964 mutex_enter(vp->v_interlock);
965 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
966
967 if (error) {
968 mutex_exit(vp->v_interlock);
969 return false;
970 }
971
972 KASSERT(vp->v_usecount == 1);
973 vcache_reclaim(vp);
974 vrelel(vp, 0, LK_NONE);
975
976 return true;
977 }
978
979 /*
980 * Helper for vrevoke() to propagate suspension from lastmp
981 * to thismp. Both args may be NULL.
982 * Returns the currently suspended file system or NULL.
983 */
984 static struct mount *
985 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
986 {
987 int error;
988
989 if (lastmp == thismp)
990 return thismp;
991
992 if (lastmp != NULL)
993 vfs_resume(lastmp);
994
995 if (thismp == NULL)
996 return NULL;
997
998 do {
999 error = vfs_suspend(thismp, 0);
1000 } while (error == EINTR || error == ERESTART);
1001
1002 if (error == 0)
1003 return thismp;
1004
1005 KASSERT(error == EOPNOTSUPP);
1006 return NULL;
1007 }
1008
1009 /*
1010 * Eliminate all activity associated with the requested vnode
1011 * and with all vnodes aliased to the requested vnode.
1012 */
1013 void
1014 vrevoke(vnode_t *vp)
1015 {
1016 struct mount *mp;
1017 vnode_t *vq;
1018 enum vtype type;
1019 dev_t dev;
1020
1021 KASSERT(vp->v_usecount > 0);
1022
1023 mp = vrevoke_suspend_next(NULL, vp->v_mount);
1024
1025 mutex_enter(vp->v_interlock);
1026 VSTATE_WAIT_STABLE(vp);
1027 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1028 mutex_exit(vp->v_interlock);
1029 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1030 vp->v_usecount++;
1031 mutex_exit(vp->v_interlock);
1032 vgone(vp);
1033 } else {
1034 dev = vp->v_rdev;
1035 type = vp->v_type;
1036 mutex_exit(vp->v_interlock);
1037
1038 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1039 mp = vrevoke_suspend_next(mp, vq->v_mount);
1040 vgone(vq);
1041 }
1042 }
1043 vrevoke_suspend_next(mp, NULL);
1044 }
1045
1046 /*
1047 * Eliminate all activity associated with a vnode in preparation for
1048 * reuse. Drops a reference from the vnode.
1049 */
1050 void
1051 vgone(vnode_t *vp)
1052 {
1053 int lktype;
1054
1055 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1056
1057 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1058 lktype = LK_EXCLUSIVE;
1059 mutex_enter(vp->v_interlock);
1060 VSTATE_WAIT_STABLE(vp);
1061 if (VSTATE_GET(vp) == VS_LOADED) {
1062 vcache_reclaim(vp);
1063 lktype = LK_NONE;
1064 }
1065 VSTATE_ASSERT(vp, VS_RECLAIMED);
1066 vrelel(vp, 0, lktype);
1067 }
1068
1069 static inline uint32_t
1070 vcache_hash(const struct vcache_key *key)
1071 {
1072 uint32_t hash = HASH32_BUF_INIT;
1073
1074 KASSERT(key->vk_key_len > 0);
1075
1076 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1077 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1078 return hash;
1079 }
1080
1081 static void
1082 vcache_init(void)
1083 {
1084
1085 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0,
1086 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1087 KASSERT(vcache_pool != NULL);
1088 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1089 cv_init(&vcache_cv, "vcache");
1090 vcache_hashsize = desiredvnodes;
1091 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1092 &vcache_hashmask);
1093 }
1094
1095 static void
1096 vcache_reinit(void)
1097 {
1098 int i;
1099 uint32_t hash;
1100 u_long oldmask, newmask;
1101 struct hashhead *oldtab, *newtab;
1102 vnode_impl_t *vip;
1103
1104 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1105 mutex_enter(&vcache_lock);
1106 oldtab = vcache_hashtab;
1107 oldmask = vcache_hashmask;
1108 vcache_hashsize = desiredvnodes;
1109 vcache_hashtab = newtab;
1110 vcache_hashmask = newmask;
1111 for (i = 0; i <= oldmask; i++) {
1112 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1113 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1114 hash = vcache_hash(&vip->vi_key);
1115 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1116 vip, vi_hash);
1117 }
1118 }
1119 mutex_exit(&vcache_lock);
1120 hashdone(oldtab, HASH_SLIST, oldmask);
1121 }
1122
1123 static inline vnode_impl_t *
1124 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1125 {
1126 struct hashhead *hashp;
1127 vnode_impl_t *vip;
1128
1129 KASSERT(mutex_owned(&vcache_lock));
1130
1131 hashp = &vcache_hashtab[hash & vcache_hashmask];
1132 SLIST_FOREACH(vip, hashp, vi_hash) {
1133 if (key->vk_mount != vip->vi_key.vk_mount)
1134 continue;
1135 if (key->vk_key_len != vip->vi_key.vk_key_len)
1136 continue;
1137 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1138 continue;
1139 return vip;
1140 }
1141 return NULL;
1142 }
1143
1144 /*
1145 * Allocate a new, uninitialized vcache node.
1146 */
1147 static vnode_impl_t *
1148 vcache_alloc(void)
1149 {
1150 vnode_impl_t *vip;
1151 vnode_t *vp;
1152
1153 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1154 vp = VIMPL_TO_VNODE(vip);
1155 memset(vip, 0, sizeof(*vip));
1156
1157 vip->vi_lock = rw_obj_alloc();
1158 vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
1159
1160 /* SLIST_INIT(&vip->vi_hash); */
1161 TAILQ_INIT(&vip->vi_nclist);
1162 /* LIST_INIT(&vip->vi_dnclist); */
1163
1164 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
1165 cv_init(&vp->v_cv, "vnode");
1166
1167 vp->v_usecount = 1;
1168 vp->v_type = VNON;
1169 vp->v_size = vp->v_writesize = VSIZENOTSET;
1170
1171 vip->vi_state = VS_LOADING;
1172
1173 lru_requeue(vp, &lru_list[LRU_FREE]);
1174
1175 return vip;
1176 }
1177
1178 /*
1179 * Deallocate a vcache node in state VS_LOADING.
1180 *
1181 * vcache_lock held on entry and released on return.
1182 */
1183 static void
1184 vcache_dealloc(vnode_impl_t *vip)
1185 {
1186 vnode_t *vp;
1187
1188 KASSERT(mutex_owned(&vcache_lock));
1189
1190 vp = VIMPL_TO_VNODE(vip);
1191 vfs_ref(dead_rootmount);
1192 vfs_insmntque(vp, dead_rootmount);
1193 mutex_enter(vp->v_interlock);
1194 vp->v_op = dead_vnodeop_p;
1195 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1196 mutex_exit(&vcache_lock);
1197 vrelel(vp, 0, LK_NONE);
1198 }
1199
1200 /*
1201 * Free an unused, unreferenced vcache node.
1202 * v_interlock locked on entry.
1203 */
1204 static void
1205 vcache_free(vnode_impl_t *vip)
1206 {
1207 vnode_t *vp;
1208
1209 vp = VIMPL_TO_VNODE(vip);
1210 KASSERT(mutex_owned(vp->v_interlock));
1211
1212 KASSERT(vp->v_usecount == 0);
1213 KASSERT(vp->v_holdcnt == 0);
1214 KASSERT(vp->v_writecount == 0);
1215 lru_requeue(vp, NULL);
1216 mutex_exit(vp->v_interlock);
1217
1218 vfs_insmntque(vp, NULL);
1219 if (vp->v_type == VBLK || vp->v_type == VCHR)
1220 spec_node_destroy(vp);
1221
1222 mutex_obj_free(vp->v_interlock);
1223 rw_obj_free(vip->vi_lock);
1224 uvm_obj_destroy(&vp->v_uobj, true);
1225 cv_destroy(&vp->v_cv);
1226 pool_cache_put(vcache_pool, vip);
1227 }
1228
1229 /*
1230 * Try to get an initial reference on this cached vnode.
1231 * Returns zero on success, ENOENT if the vnode has been reclaimed and
1232 * EBUSY if the vnode state is unstable.
1233 *
1234 * v_interlock locked on entry and unlocked on exit.
1235 */
1236 int
1237 vcache_tryvget(vnode_t *vp)
1238 {
1239 int error = 0;
1240
1241 KASSERT(mutex_owned(vp->v_interlock));
1242
1243 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED))
1244 error = ENOENT;
1245 else if (__predict_false(VSTATE_GET(vp) != VS_LOADED))
1246 error = EBUSY;
1247 else
1248 vp->v_usecount++;
1249
1250 mutex_exit(vp->v_interlock);
1251
1252 return error;
1253 }
1254
1255 /*
1256 * Try to get an initial reference on this cached vnode.
1257 * Returns zero on success and ENOENT if the vnode has been reclaimed.
1258 * Will wait for the vnode state to be stable.
1259 *
1260 * v_interlock locked on entry and unlocked on exit.
1261 */
1262 int
1263 vcache_vget(vnode_t *vp)
1264 {
1265
1266 KASSERT(mutex_owned(vp->v_interlock));
1267
1268 /* Increment hold count to prevent vnode from disappearing. */
1269 vp->v_holdcnt++;
1270 VSTATE_WAIT_STABLE(vp);
1271 vp->v_holdcnt--;
1272
1273 /* If this was the last reference to a reclaimed vnode free it now. */
1274 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1275 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
1276 vcache_free(VNODE_TO_VIMPL(vp));
1277 else
1278 mutex_exit(vp->v_interlock);
1279 return ENOENT;
1280 }
1281 VSTATE_ASSERT(vp, VS_LOADED);
1282 vp->v_usecount++;
1283 mutex_exit(vp->v_interlock);
1284
1285 return 0;
1286 }
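
/*
 * Illustrative sketch (editorial addition): a caller that already holds a
 * pointer to a cached vnode, protected only by v_interlock, might combine
 * the two routines above: try the cheap, non-blocking variant first and
 * fall back to the waiting variant on EBUSY.  This is an example pattern,
 * not a requirement of the interface.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vcache_tryvget(vp);		// releases v_interlock
 *	if (error == EBUSY) {
 *		mutex_enter(vp->v_interlock);
 *		error = vcache_vget(vp);	// waits for a stable state
 *	}
 *	// error == 0:      we now hold a reference on vp.
 *	// error == ENOENT: the vnode was reclaimed; repeat the lookup.
 */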
1287
1288 /*
1289 * Get a vnode / fs node pair by key and return it referenced through vpp.
1290 */
1291 int
1292 vcache_get(struct mount *mp, const void *key, size_t key_len,
1293 struct vnode **vpp)
1294 {
1295 int error;
1296 uint32_t hash;
1297 const void *new_key;
1298 struct vnode *vp;
1299 struct vcache_key vcache_key;
1300 vnode_impl_t *vip, *new_vip;
1301
1302 new_key = NULL;
1303 *vpp = NULL;
1304
1305 vcache_key.vk_mount = mp;
1306 vcache_key.vk_key = key;
1307 vcache_key.vk_key_len = key_len;
1308 hash = vcache_hash(&vcache_key);
1309
1310 again:
1311 mutex_enter(&vcache_lock);
1312 vip = vcache_hash_lookup(&vcache_key, hash);
1313
1314 /* If found, take a reference or retry. */
1315 if (__predict_true(vip != NULL)) {
1316 /*
1317 * If the vnode is loading we cannot take the v_interlock
1318 * here as it might change during load (see uvm_obj_setlock()).
1319 * As changing state from VS_LOADING requires both vcache_lock
1320 * and v_interlock it is safe to test with vcache_lock held.
1321 *
1322 * Wait for vnodes changing state from VS_LOADING and retry.
1323 */
1324 if (__predict_false(vip->vi_state == VS_LOADING)) {
1325 cv_wait(&vcache_cv, &vcache_lock);
1326 mutex_exit(&vcache_lock);
1327 goto again;
1328 }
1329 vp = VIMPL_TO_VNODE(vip);
1330 mutex_enter(vp->v_interlock);
1331 mutex_exit(&vcache_lock);
1332 error = vcache_vget(vp);
1333 if (error == ENOENT)
1334 goto again;
1335 if (error == 0)
1336 *vpp = vp;
1337 KASSERT((error != 0) == (*vpp == NULL));
1338 return error;
1339 }
1340 mutex_exit(&vcache_lock);
1341
1342 /* Allocate and initialize a new vcache / vnode pair. */
1343 error = vfs_busy(mp);
1344 if (error)
1345 return error;
1346 new_vip = vcache_alloc();
1347 new_vip->vi_key = vcache_key;
1348 vp = VIMPL_TO_VNODE(new_vip);
1349 mutex_enter(&vcache_lock);
1350 vip = vcache_hash_lookup(&vcache_key, hash);
1351 if (vip == NULL) {
1352 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1353 new_vip, vi_hash);
1354 vip = new_vip;
1355 }
1356
1357 /* If another thread beat us inserting this node, retry. */
1358 if (vip != new_vip) {
1359 vcache_dealloc(new_vip);
1360 vfs_unbusy(mp);
1361 goto again;
1362 }
1363 mutex_exit(&vcache_lock);
1364
1365 /* Load the fs node. Exclusive as new_node is VS_LOADING. */
1366 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1367 if (error) {
1368 mutex_enter(&vcache_lock);
1369 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1370 new_vip, vnode_impl, vi_hash);
1371 vcache_dealloc(new_vip);
1372 vfs_unbusy(mp);
1373 KASSERT(*vpp == NULL);
1374 return error;
1375 }
1376 KASSERT(new_key != NULL);
1377 KASSERT(memcmp(key, new_key, key_len) == 0);
1378 KASSERT(vp->v_op != NULL);
1379 vfs_insmntque(vp, mp);
1380 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1381 vp->v_vflag |= VV_MPSAFE;
1382 vfs_ref(mp);
1383 vfs_unbusy(mp);
1384
1385 /* Finished loading, finalize node. */
1386 mutex_enter(&vcache_lock);
1387 new_vip->vi_key.vk_key = new_key;
1388 mutex_enter(vp->v_interlock);
1389 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1390 mutex_exit(vp->v_interlock);
1391 mutex_exit(&vcache_lock);
1392 *vpp = vp;
1393 return 0;
1394 }
1395
1396 /*
1397 * Create a new vnode / fs node pair and return it referenced through vpp.
1398 */
1399 int
1400 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1401 kauth_cred_t cred, void *extra, struct vnode **vpp)
1402 {
1403 int error;
1404 uint32_t hash;
1405 struct vnode *vp, *ovp;
1406 vnode_impl_t *vip, *ovip;
1407
1408 *vpp = NULL;
1409
1410 /* Allocate and initialize a new vcache / vnode pair. */
1411 error = vfs_busy(mp);
1412 if (error)
1413 return error;
1414 vip = vcache_alloc();
1415 vip->vi_key.vk_mount = mp;
1416 vp = VIMPL_TO_VNODE(vip);
1417
1418 /* Create and load the fs node. */
1419 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1420 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1421 if (error) {
1422 mutex_enter(&vcache_lock);
1423 vcache_dealloc(vip);
1424 vfs_unbusy(mp);
1425 KASSERT(*vpp == NULL);
1426 return error;
1427 }
1428 KASSERT(vp->v_op != NULL);
1429 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1430 if (vip->vi_key.vk_key_len > 0) {
1431 KASSERT(vip->vi_key.vk_key != NULL);
1432 hash = vcache_hash(&vip->vi_key);
1433
1434 /*
1435 * Wait for previous instance to be reclaimed,
1436 * then insert new node.
1437 */
1438 mutex_enter(&vcache_lock);
1439 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1440 ovp = VIMPL_TO_VNODE(ovip);
1441 mutex_enter(ovp->v_interlock);
1442 mutex_exit(&vcache_lock);
1443 error = vcache_vget(ovp);
1444 KASSERT(error == ENOENT);
1445 mutex_enter(&vcache_lock);
1446 }
1447 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1448 vip, vi_hash);
1449 mutex_exit(&vcache_lock);
1450 }
1451 vfs_insmntque(vp, mp);
1452 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1453 vp->v_vflag |= VV_MPSAFE;
1454 vfs_ref(mp);
1455 vfs_unbusy(mp);
1456
1457 /* Finished loading, finalize node. */
1458 mutex_enter(&vcache_lock);
1459 mutex_enter(vp->v_interlock);
1460 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1461 mutex_exit(&vcache_lock);
1462 mutex_exit(vp->v_interlock);
1463 *vpp = vp;
1464 return 0;
1465 }
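
/*
 * Illustrative sketch (editorial addition): a file system's create-style
 * operation would typically obtain its fresh vnode through vcache_new(),
 * which calls the file system's VFS_NEWVNODE() to create and load the fs
 * node.  The argument names (dvp, vap, cnp) follow the usual VOP_CREATE(9)
 * conventions and are assumptions here, not taken from this file.
 *
 *	error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, &vp);
 *	if (error == 0) {
 *		// vp is referenced and LOADED, but not locked.
 *		error = vn_lock(vp, LK_EXCLUSIVE);
 *	}
 */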
1466
1467 /*
  1468  * Prepare key change: update the old cache node's key and lock the new cache node.
1469 * Return an error if the new node already exists.
1470 */
1471 int
1472 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1473 const void *old_key, size_t old_key_len,
1474 const void *new_key, size_t new_key_len)
1475 {
1476 uint32_t old_hash, new_hash;
1477 struct vcache_key old_vcache_key, new_vcache_key;
1478 vnode_impl_t *vip, *new_vip;
1479
1480 old_vcache_key.vk_mount = mp;
1481 old_vcache_key.vk_key = old_key;
1482 old_vcache_key.vk_key_len = old_key_len;
1483 old_hash = vcache_hash(&old_vcache_key);
1484
1485 new_vcache_key.vk_mount = mp;
1486 new_vcache_key.vk_key = new_key;
1487 new_vcache_key.vk_key_len = new_key_len;
1488 new_hash = vcache_hash(&new_vcache_key);
1489
1490 new_vip = vcache_alloc();
1491 new_vip->vi_key = new_vcache_key;
1492
1493 /* Insert locked new node used as placeholder. */
1494 mutex_enter(&vcache_lock);
1495 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1496 if (vip != NULL) {
1497 vcache_dealloc(new_vip);
1498 return EEXIST;
1499 }
1500 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1501 new_vip, vi_hash);
1502
  1503 	/* Replace the old node's key with the temporary copy. */
1504 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1505 KASSERT(vip != NULL);
1506 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1507 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1508 vip->vi_key = old_vcache_key;
1509 mutex_exit(&vcache_lock);
1510 return 0;
1511 }
1512
1513 /*
1514 * Key change complete: update old node and remove placeholder.
1515 */
1516 void
1517 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1518 const void *old_key, size_t old_key_len,
1519 const void *new_key, size_t new_key_len)
1520 {
1521 uint32_t old_hash, new_hash;
1522 struct vcache_key old_vcache_key, new_vcache_key;
1523 vnode_impl_t *vip, *new_vip;
1524 struct vnode *new_vp;
1525
1526 old_vcache_key.vk_mount = mp;
1527 old_vcache_key.vk_key = old_key;
1528 old_vcache_key.vk_key_len = old_key_len;
1529 old_hash = vcache_hash(&old_vcache_key);
1530
1531 new_vcache_key.vk_mount = mp;
1532 new_vcache_key.vk_key = new_key;
1533 new_vcache_key.vk_key_len = new_key_len;
1534 new_hash = vcache_hash(&new_vcache_key);
1535
1536 mutex_enter(&vcache_lock);
1537
1538 /* Lookup old and new node. */
1539 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1540 KASSERT(vip != NULL);
1541 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1542
1543 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1544 KASSERT(new_vip != NULL);
1545 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1546 new_vp = VIMPL_TO_VNODE(new_vip);
1547 mutex_enter(new_vp->v_interlock);
1548 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1549 mutex_exit(new_vp->v_interlock);
1550
1551 /* Rekey old node and put it onto its new hashlist. */
1552 vip->vi_key = new_vcache_key;
1553 if (old_hash != new_hash) {
1554 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1555 vip, vnode_impl, vi_hash);
1556 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1557 vip, vi_hash);
1558 }
1559
1560 /* Remove new node used as placeholder. */
1561 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1562 new_vip, vnode_impl, vi_hash);
1563 vcache_dealloc(new_vip);
1564 }
1565
1566 /*
1567 * Disassociate the underlying file system from a vnode.
1568 *
1569 * Must be called with vnode locked and will return unlocked.
1570 * Must be called with the interlock held, and will return with it held.
1571 */
1572 static void
1573 vcache_reclaim(vnode_t *vp)
1574 {
1575 lwp_t *l = curlwp;
1576 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1577 struct mount *mp = vp->v_mount;
1578 uint32_t hash;
1579 uint8_t temp_buf[64], *temp_key;
1580 size_t temp_key_len;
1581 bool recycle, active;
1582 int error;
1583
1584 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1585 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1586 KASSERT(mutex_owned(vp->v_interlock));
1587 KASSERT(vp->v_usecount != 0);
1588
1589 active = (vp->v_usecount > 1);
1590 temp_key_len = vip->vi_key.vk_key_len;
1591 /*
1592 * Prevent the vnode from being recycled or brought into use
1593 * while we clean it out.
1594 */
1595 VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING);
1596 mutex_exit(vp->v_interlock);
1597
1598 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1599 mutex_enter(vp->v_interlock);
1600 if ((vp->v_iflag & VI_EXECMAP) != 0 && vp->v_uobj.uo_npages != 0) {
1601 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
1602 cpu_count(CPU_COUNT_FILEPAGES, vp->v_uobj.uo_npages);
1603 }
1604 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1605 mutex_exit(vp->v_interlock);
1606 rw_exit(vp->v_uobj.vmobjlock);
1607
1608 /* Replace the vnode key with a temporary copy. */
1609 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1610 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1611 } else {
1612 temp_key = temp_buf;
1613 }
1614 if (vip->vi_key.vk_key_len > 0) {
1615 mutex_enter(&vcache_lock);
1616 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1617 vip->vi_key.vk_key = temp_key;
1618 mutex_exit(&vcache_lock);
1619 }
1620
1621 fstrans_start(mp);
1622
1623 /*
1624 * Clean out any cached data associated with the vnode.
1625 * If purging an active vnode, it must be closed and
1626 * deactivated before being reclaimed.
1627 */
1628 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1629 if (error != 0) {
1630 if (wapbl_vphaswapbl(vp))
1631 WAPBL_DISCARD(wapbl_vptomp(vp));
1632 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1633 }
1634 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1635 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1636 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1637 spec_node_revoke(vp);
1638 }
1639
1640 /*
1641 * Disassociate the underlying file system from the vnode.
1642 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1643 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1644 * would no longer function.
1645 */
1646 VOP_INACTIVE(vp, &recycle);
1647 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1648 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1649 if (VOP_RECLAIM(vp)) {
1650 vnpanic(vp, "%s: cannot reclaim", __func__);
1651 }
1652
1653 KASSERT(vp->v_data == NULL);
1654 KASSERT(vp->v_uobj.uo_npages == 0);
1655
1656 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1657 uvm_ra_freectx(vp->v_ractx);
1658 vp->v_ractx = NULL;
1659 }
1660
1661 /* Purge name cache. */
1662 cache_purge(vp);
1663
1664 if (vip->vi_key.vk_key_len > 0) {
1665 /* Remove from vnode cache. */
1666 hash = vcache_hash(&vip->vi_key);
1667 mutex_enter(&vcache_lock);
1668 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1669 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1670 vip, vnode_impl, vi_hash);
1671 mutex_exit(&vcache_lock);
1672 }
1673 if (temp_key != temp_buf)
1674 kmem_free(temp_key, temp_key_len);
1675
1676 /* Done with purge, notify sleepers of the grim news. */
1677 mutex_enter(vp->v_interlock);
1678 vp->v_op = dead_vnodeop_p;
1679 vp->v_vflag |= VV_LOCKSWORK;
1680 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1681 vp->v_tag = VT_NON;
1682 KNOTE(&vp->v_klist, NOTE_REVOKE);
1683 mutex_exit(vp->v_interlock);
1684
1685 /*
1686 * Move to dead mount. Must be after changing the operations
1687 * vector as vnode operations enter the mount before using the
1688 * operations vector. See sys/kern/vnode_if.c.
1689 */
1690 vp->v_vflag &= ~VV_ROOT;
1691 vfs_ref(dead_rootmount);
1692 vfs_insmntque(vp, dead_rootmount);
1693
1694 #ifdef PAX_SEGVGUARD
1695 pax_segvguard_cleanup(vp);
1696 #endif /* PAX_SEGVGUARD */
1697
1698 mutex_enter(vp->v_interlock);
1699 fstrans_done(mp);
1700 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1701 }
1702
1703 /*
1704 * Disassociate the underlying file system from an open device vnode
1705 * and make it anonymous.
1706 *
1707 * Vnode unlocked on entry, drops a reference to the vnode.
1708 */
1709 void
1710 vcache_make_anon(vnode_t *vp)
1711 {
1712 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1713 uint32_t hash;
1714 bool recycle;
1715
1716 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
1717 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1718 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
1719
1720 /* Remove from vnode cache. */
1721 hash = vcache_hash(&vip->vi_key);
1722 mutex_enter(&vcache_lock);
1723 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1724 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1725 vip, vnode_impl, vi_hash);
1726 vip->vi_key.vk_mount = dead_rootmount;
1727 vip->vi_key.vk_key_len = 0;
1728 vip->vi_key.vk_key = NULL;
1729 mutex_exit(&vcache_lock);
1730
1731 /*
1732 * Disassociate the underlying file system from the vnode.
1733 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1734 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1735 * would no longer function.
1736 */
1737 if (vn_lock(vp, LK_EXCLUSIVE)) {
1738 vnpanic(vp, "%s: cannot lock", __func__);
1739 }
1740 VOP_INACTIVE(vp, &recycle);
1741 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1742 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1743 if (VOP_RECLAIM(vp)) {
1744 vnpanic(vp, "%s: cannot reclaim", __func__);
1745 }
1746
1747 /* Purge name cache. */
1748 cache_purge(vp);
1749
1750 /* Done with purge, change operations vector. */
1751 mutex_enter(vp->v_interlock);
1752 vp->v_op = spec_vnodeop_p;
1753 vp->v_vflag |= VV_MPSAFE;
1754 vp->v_vflag &= ~VV_LOCKSWORK;
1755 mutex_exit(vp->v_interlock);
1756
1757 /*
1758 * Move to dead mount. Must be after changing the operations
1759 * vector as vnode operations enter the mount before using the
1760 * operations vector. See sys/kern/vnode_if.c.
1761 */
1762 vfs_ref(dead_rootmount);
1763 vfs_insmntque(vp, dead_rootmount);
1764
1765 vrele(vp);
1766 }
1767
1768 /*
1769 * Update outstanding I/O count and do wakeup if requested.
1770 */
1771 void
1772 vwakeup(struct buf *bp)
1773 {
1774 vnode_t *vp;
1775
1776 if ((vp = bp->b_vp) == NULL)
1777 return;
1778
1779 KASSERT(bp->b_objlock == vp->v_interlock);
1780 KASSERT(mutex_owned(bp->b_objlock));
1781
1782 if (--vp->v_numoutput < 0)
1783 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1784 if (vp->v_numoutput == 0)
1785 cv_broadcast(&vp->v_cv);
1786 }
1787
1788 /*
1789 * Test a vnode for being or becoming dead. Returns one of:
1790 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1791 * ENOENT: vnode is dead.
1792 * 0: otherwise.
1793 *
1794 * Whenever this function returns a non-zero value all future
1795 * calls will also return a non-zero value.
1796 */
1797 int
1798 vdead_check(struct vnode *vp, int flags)
1799 {
1800
1801 KASSERT(mutex_owned(vp->v_interlock));
1802
1803 if (! ISSET(flags, VDEAD_NOWAIT))
1804 VSTATE_WAIT_STABLE(vp);
1805
1806 if (VSTATE_GET(vp) == VS_RECLAIMING) {
1807 KASSERT(ISSET(flags, VDEAD_NOWAIT));
1808 return EBUSY;
1809 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
1810 return ENOENT;
1811 }
1812
1813 return 0;
1814 }
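
/*
 * Illustrative sketch (editorial addition): vdead_check() is called with
 * v_interlock held.  A non-blocking caller passes VDEAD_NOWAIT and handles
 * EBUSY ("becoming dead") and ENOENT ("dead") separately; this fragment is
 * an example pattern only.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error == ENOENT) {
 *		// The vnode is dead; fs-specific data is gone.
 *	} else if (error == EBUSY) {
 *		// Reclamation is in progress; retry later or give up.
 *	}
 */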
1815
1816 int
1817 vfs_drainvnodes(void)
1818 {
1819 int i, gen;
1820
1821 mutex_enter(&vdrain_lock);
1822 for (i = 0; i < 2; i++) {
1823 gen = vdrain_gen;
1824 while (gen == vdrain_gen) {
1825 cv_broadcast(&vdrain_cv);
1826 cv_wait(&vdrain_gen_cv, &vdrain_lock);
1827 }
1828 }
1829 mutex_exit(&vdrain_lock);
1830
1831 if (numvnodes >= desiredvnodes)
1832 return EBUSY;
1833
1834 if (vcache_hashsize != desiredvnodes)
1835 vcache_reinit();
1836
1837 return 0;
1838 }
1839
1840 void
1841 vnpanic(vnode_t *vp, const char *fmt, ...)
1842 {
1843 va_list ap;
1844
1845 #ifdef DIAGNOSTIC
1846 vprint(NULL, vp);
1847 #endif
1848 va_start(ap, fmt);
1849 vpanic(fmt, ap);
1850 va_end(ap);
1851 }
1852
1853 void
1854 vshareilock(vnode_t *tvp, vnode_t *fvp)
1855 {
1856 kmutex_t *oldlock;
1857
1858 oldlock = tvp->v_interlock;
1859 mutex_obj_hold(fvp->v_interlock);
1860 tvp->v_interlock = fvp->v_interlock;
1861 mutex_obj_free(oldlock);
1862 }
1863