/*	$NetBSD: vfs_vnode.c,v 1.105.2.8 2020/02/23 19:14:03 ad Exp $	*/

/*-
 * Copyright (c) 1997-2011, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */
/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 * Normally, there are two points where new vnodes are created:
 * VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 * starts in one of the following ways:
 *
 * - Allocation, via vcache_get(9) or vcache_new(9).
 * - Reclamation of an inactive vnode, via vcache_vget(9).
 *
 * Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 * was another, traditional way.  Currently, only the draining thread
 * recycles vnodes.  This behaviour might be revisited.
 *
 * The life-cycle ends when the last reference is dropped, usually
 * in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 * the file system that the vnode is inactive.  Via this call, the file
 * system indicates whether the vnode can be recycled (usually, it
 * checks its own references, e.g. count of links, whether the file
 * was removed).
 *
 * Depending on that indication, the vnode can be put onto a free list
 * (cache), or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9)
 * to disassociate the underlying file system from the vnode, and
 * finally destroyed.
 *
 * Vnode state
 *
 * A vnode is always in one of six states:
 * - MARKER      This is a marker vnode to help list traversal.  It
 *               will never change its state.
 * - LOADING     Vnode is associating with the underlying file system
 *               and is not yet ready to use.
 * - LOADED      Vnode has an associated underlying file system and is
 *               ready to use.
 * - BLOCKED     Vnode is active but cannot get new references.
 * - RECLAIMING  Vnode is disassociating from the underlying file
 *               system.
 * - RECLAIMED   Vnode has disassociated from the underlying file
 *               system and is dead.
 *
 * Valid state changes are:
 * LOADING -> LOADED
 *               Vnode has been initialised in vcache_get() or
 *               vcache_new() and is ready to use.
 * LOADED -> RECLAIMING
 *               Vnode starts disassociation from the underlying file
 *               system in vcache_reclaim().
 * RECLAIMING -> RECLAIMED
 *               Vnode finished disassociation from the underlying file
 *               system in vcache_reclaim().
 * LOADED -> BLOCKED
 *               vcache_rekey*() is changing the vnode key.
 * BLOCKED -> LOADED
 *               The block condition is over.
 * LOADING -> RECLAIMED
 *               Either vcache_get() or vcache_new() failed to
 *               associate the underlying file system, or vcache_rekey*()
 *               dropped a vnode used as a placeholder.
 *
 * Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
 * and it is possible to wait for a state change.
 *
 * State is protected with v_interlock, with one exception:
 * to change from LOADING both v_interlock and vcache_lock must be
 * held, so it is possible to check "state == LOADING" while holding
 * only vcache_lock.  See vcache_get() for details.
 *
 * Reference counting
 *
 * A vnode is considered active if its reference count
 * (vnode_t::v_usecount) is non-zero.  The count is maintained using
 * the vref(9), vrele(9) and vput(9) routines.  Common points holding
 * references are, e.g., open files, the current working directory
 * and mount points.
 *
 * Note on v_usecount and its locking
 *
 * At nearly all points where it is known that v_usecount could be
 * zero, vnode_t::v_interlock will be held.  To change the count away
 * from zero, the interlock must be held.  To change from a non-zero
 * value to zero, again the interlock must be held.
 *
 * Changing the usecount from a non-zero value to a non-zero value can
 * safely be done using atomic operations, without the interlock held.
 */
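
/*
 * Illustrative sketch of the rules above (a minimal example, not an
 * interface defined in this file): a caller that already holds a
 * reference may take another one without the interlock, because the
 * count is known to be non-zero; releasing goes through vrele(),
 * which takes v_interlock only if the count may reach zero.
 *
 *	vref(vp);	(non-zero -> non-zero: atomic, lock-free)
 *	...use the vnode...
 *	vrele(vp);	(may be the last reference)
 */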

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.105.2.8 2020/02/23 19:14:03 ad Exp $");

#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pax.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_stat.h>

/* Flags to vrelel. */
#define VRELEL_ASYNC	0x0001	/* Always defer to vrele thread. */

#define LRU_VRELE	0
#define LRU_FREE	1
#define LRU_HOLD	2
#define LRU_COUNT	3

/*
 * There are three lru lists: one holds vnodes waiting for async release,
 * one is for vnodes which have no buffer/page references and one for those
 * which do (i.e. v_holdcnt is non-zero).  We put the lists into a single,
 * private cache line as vnodes migrate between them while under the same
 * lock (vdrain_lock).
 */
u_int			numvnodes		__cacheline_aligned;
static vnodelst_t	lru_list[LRU_COUNT]	__cacheline_aligned;
static kmutex_t		vdrain_lock		__cacheline_aligned;
static kcondvar_t	vdrain_cv;
static int		vdrain_gen;
static kcondvar_t	vdrain_gen_cv;
static bool		vdrain_retry;
static lwp_t *		vdrain_lwp;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t		vcache_lock		__cacheline_aligned;
static kcondvar_t	vcache_cv;
static u_int		vcache_hashsize;
static u_long		vcache_hashmask;
static struct hashhead	*vcache_hashtab;
static pool_cache_t	vcache_pool;
static void		lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t *	lru_which(vnode_t *);
static vnode_impl_t *	vcache_alloc(void);
static void		vcache_dealloc(vnode_impl_t *);
static void		vcache_free(vnode_impl_t *);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vcache_reclaim(vnode_t *);
static void		vrelel(vnode_t *, int, int);
static void		vdrain_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern struct mount	*dead_rootmount;
extern int		(**dead_vnodeop_p)(void *);
extern int		(**spec_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

/* Vnode state operations and diagnostics. */

#if defined(DIAGNOSTIC)

#define VSTATE_VALID(state) \
	((state) != VS_ACTIVE && (state) != VS_MARKER)
#define VSTATE_GET(vp) \
	vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
	vstate_assert_wait_stable((vp), __func__, __LINE__)

void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
    bool has_lock)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	if (!has_lock) {
		/*
		 * Prevent predictive loads from the CPU, but check the state
		 * without locking first.
		 */
		membar_enter();
		if (state == VS_ACTIVE && vp->v_usecount > 0 &&
		    (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED))
			return;
		if (vip->vi_state == state)
			return;
		mutex_enter((vp)->v_interlock);
	}

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);

	if ((state == VS_ACTIVE && vp->v_usecount > 0 &&
	    (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
	    vip->vi_state == state) {
		if (!has_lock)
			mutex_exit((vp)->v_interlock);
		return;
	}
	vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
	    vstate_name(vip->vi_state), vp->v_usecount,
	    vstate_name(state), func, line);
}

static enum vnode_state
vstate_assert_get(vnode_t *vp, const char *func, int line)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (! VSTATE_VALID(vip->vi_state))
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(vip->vi_state), func, line);

	return vip->vi_state;
}

static void
vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (! VSTATE_VALID(vip->vi_state))
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(vip->vi_state), func, line);

	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
		cv_wait(&vp->v_cv, vp->v_interlock);

	if (! VSTATE_VALID(vip->vi_state))
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(vip->vi_state), func, line);
}

static void
vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
    const char *func, int line)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (from == VS_LOADING)
		KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);

	if (! VSTATE_VALID(from))
		vnpanic(vp, "from is %s at %s:%d",
		    vstate_name(from), func, line);
	if (! VSTATE_VALID(to))
		vnpanic(vp, "to is %s at %s:%d",
		    vstate_name(to), func, line);
	if (vip->vi_state != from)
		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
		    vstate_name(vip->vi_state), vstate_name(from), func, line);
	if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1)
		vnpanic(vp, "%s to %s with usecount %d at %s:%d",
		    vstate_name(from), vstate_name(to), vp->v_usecount,
		    func, line);

	vip->vi_state = to;
	if (from == VS_LOADING)
		cv_broadcast(&vcache_cv);
	if (to == VS_LOADED || to == VS_RECLAIMED)
		cv_broadcast(&vp->v_cv);
}

#else /* defined(DIAGNOSTIC) */

#define VSTATE_GET(vp) \
	(VNODE_TO_VIMPL((vp))->vi_state)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_change((vp), (from), (to))
#define VSTATE_WAIT_STABLE(vp) \
	vstate_wait_stable((vp))
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
    bool has_lock)
{

}

static void
vstate_wait_stable(vnode_t *vp)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

static void
vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	vip->vi_state = to;
	if (from == VS_LOADING)
		cv_broadcast(&vcache_cv);
	if (to == VS_LOADED || to == VS_RECLAIMED)
		cv_broadcast(&vp->v_cv);
}

#endif /* defined(DIAGNOSTIC) */

void
vfs_vnode_sysinit(void)
{
	int error __diagused, i;

	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_rootmount != NULL);
	dead_rootmount->mnt_iflag |= IMNT_MPSAFE;

	mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
	for (i = 0; i < LRU_COUNT; i++) {
		TAILQ_INIT(&lru_list[i]);
	}
	vcache_init();

	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vdrain_gen_cv, "vdrainwt");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, &vdrain_lwp, "vdrain");
	KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
}

/*
 * Allocate a new marker vnode.
 */
vnode_t *
vnalloc_marker(struct mount *mp)
{
	vnode_impl_t *vip;
	vnode_t *vp;

	vip = pool_cache_get(vcache_pool, PR_WAITOK);
	memset(vip, 0, sizeof(*vip));
	vp = VIMPL_TO_VNODE(vip);
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	vp->v_mount = mp;
	vp->v_type = VBAD;
	vip->vi_state = VS_MARKER;

	return vp;
}

/*
 * Free a marker vnode.
 */
void
vnfree_marker(vnode_t *vp)
{
	vnode_impl_t *vip;

	vip = VNODE_TO_VIMPL(vp);
	KASSERT(vip->vi_state == VS_MARKER);
	uvm_obj_destroy(&vp->v_uobj, true);
	pool_cache_put(vcache_pool, vip);
}

/*
 * Test a vnode for being a marker vnode.
 */
bool
vnis_marker(vnode_t *vp)
{

	return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
}

/*
 * Return the lru list this node should be on.
 */
static vnodelst_t *
lru_which(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_holdcnt > 0)
		return &lru_list[LRU_HOLD];
	else
		return &lru_list[LRU_FREE];
}

/*
 * Put vnode to end of given list.
 * Both the current and the new list may be NULL, used on vnode alloc/free.
 * Adjust numvnodes and signal vdrain thread if there is work.
 */
static void
lru_requeue(vnode_t *vp, vnodelst_t *listhd)
{
	vnode_impl_t *vip;
	int d;

	/*
	 * If the vnode is on the correct list, and was put there recently,
	 * then leave it be, thus avoiding huge cache and lock contention.
	 */
	vip = VNODE_TO_VIMPL(vp);
	if (listhd == vip->vi_lrulisthd &&
	    (hardclock_ticks - vip->vi_lrulisttm) < hz) {
		return;
	}

	mutex_enter(&vdrain_lock);
	d = 0;
	if (vip->vi_lrulisthd != NULL)
		TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
	else
		d++;
	vip->vi_lrulisthd = listhd;
	vip->vi_lrulisttm = hardclock_ticks;
	if (vip->vi_lrulisthd != NULL)
		TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
	else
		d--;
	if (d != 0) {
		/*
		 * Looks strange?  This is not a bug.  Don't store
		 * numvnodes unless there is a change - avoid false
		 * sharing on MP.
		 */
		numvnodes += d;
	}
	if (numvnodes > desiredvnodes || listhd == &lru_list[LRU_VRELE])
		cv_broadcast(&vdrain_cv);
	mutex_exit(&vdrain_lock);
}

/*
 * Release deferred vrele vnodes for this mount.
 * Called with file system suspended.
 */
void
vrele_flush(struct mount *mp)
{
	vnode_impl_t *vip, *marker;
	vnode_t *vp;

	KASSERT(fstrans_is_owner(mp));

	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));

	mutex_enter(&vdrain_lock);
	TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);

	while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
		TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
		TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
		    vi_lrulist);
		vp = VIMPL_TO_VNODE(vip);
		if (vnis_marker(vp))
			continue;

		KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
		TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
		vip->vi_lrulisthd = &lru_list[LRU_HOLD];
		vip->vi_lrulisttm = hardclock_ticks;
		TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
		mutex_exit(&vdrain_lock);

		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0, LK_EXCLUSIVE);

		mutex_enter(&vdrain_lock);
	}

	TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
	mutex_exit(&vdrain_lock);

	vnfree_marker(VIMPL_TO_VNODE(marker));
}

/*
 * Reclaim a cached vnode.  Used from vdrain_thread only.
 */
static __inline void
vdrain_remove(vnode_t *vp)
{
	struct mount *mp;

	KASSERT(mutex_owned(&vdrain_lock));

	/* Probe usecount (unlocked). */
	if (vp->v_usecount > 0)
		return;
	/* Try v_interlock -- we lock the wrong direction! */
	if (!mutex_tryenter(vp->v_interlock))
		return;
	/* Probe usecount and state. */
	if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_LOADED) {
		mutex_exit(vp->v_interlock);
		return;
	}
	mp = vp->v_mount;
	if (fstrans_start_nowait(mp) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	}
	vdrain_retry = true;
	mutex_exit(&vdrain_lock);

	if (vcache_vget(vp) == 0) {
		if (!vrecycle(vp)) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			mutex_enter(vp->v_interlock);
			vrelel(vp, 0, LK_EXCLUSIVE);
		}
	}
	fstrans_done(mp);

	mutex_enter(&vdrain_lock);
}

/*
 * Release a cached vnode.  Used from vdrain_thread only.
 */
static __inline void
vdrain_vrele(vnode_t *vp)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
	struct mount *mp;

	KASSERT(mutex_owned(&vdrain_lock));

	mp = vp->v_mount;
	if (fstrans_start_nowait(mp) != 0)
		return;

	/*
	 * First remove the vnode from the vrele list.
	 * Put it on the last lru list; the last vrele()
	 * will put it back onto the right list before
	 * its v_usecount reaches zero.
	 */
	KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
	TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
	vip->vi_lrulisthd = &lru_list[LRU_HOLD];
	vip->vi_lrulisttm = hardclock_ticks;
	TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);

	vdrain_retry = true;
	mutex_exit(&vdrain_lock);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0, LK_EXCLUSIVE);
	fstrans_done(mp);

	mutex_enter(&vdrain_lock);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes
 * and release vnodes from asynchronous vrele.
 */
static void
vdrain_thread(void *cookie)
{
	int i;
	u_int target;
	vnode_impl_t *vip, *marker;

	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));

	mutex_enter(&vdrain_lock);

	for (;;) {
		vdrain_retry = false;
		target = desiredvnodes - desiredvnodes/10;

		for (i = 0; i < LRU_COUNT; i++) {
			TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
			while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
				TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
				TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,
				    vi_lrulist);
				if (vnis_marker(VIMPL_TO_VNODE(vip)))
					continue;
				if (i == LRU_VRELE)
					vdrain_vrele(VIMPL_TO_VNODE(vip));
				else if (numvnodes < target)
					break;
				else
					vdrain_remove(VIMPL_TO_VNODE(vip));
			}
			TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
		}

		if (vdrain_retry) {
			mutex_exit(&vdrain_lock);
			yield();
			mutex_enter(&vdrain_lock);
		} else {
			vdrain_gen++;
			cv_broadcast(&vdrain_gen_cv);
			cv_wait(&vdrain_cv, &vdrain_lock);
		}
	}
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (__predict_false(use == 1)) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{
	int lktype;

	/*
	 * Do an unlocked check of v_usecount.  If it looks like we're not
	 * about to drop the last reference, then unlock the vnode and try
	 * to drop the reference.  If it ends up being the last reference
	 * after all, we dropped the lock when we shouldn't have.  vrelel()
	 * can fix it all up.  Most of the time this will all go to plan.
	 */
	if (vp->v_usecount > 1) {
		VOP_UNLOCK(vp);
		if (vtryrele(vp)) {
			return;
		}
		lktype = LK_NONE;
	} else if ((vp->v_vflag & VV_LOCKSWORK) == 0) {
		lktype = LK_EXCLUSIVE;
	} else {
		lktype = VOP_ISLOCKED(vp);
		KASSERT(lktype != LK_NONE);
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0, lktype);
}
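
/*
 * Illustrative sketch of the usual vput() pattern (hypothetical
 * caller): the caller holds both a reference and the vnode lock and
 * gives up both in one step.
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	...operate on the locked vnode...
 *	vput(vp);
 */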

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
static void
vrelel(vnode_t *vp, int flags, int lktype)
{
	const bool async = ((flags & VRELEL_ASYNC) != 0);
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    VSTATE_GET(vp) != VS_RECLAIMED)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if (lktype != LK_NONE) {
			VOP_UNLOCK(vp);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * First try to get the vnode locked for VOP_INACTIVE().
	 * Defer vnode release to vdrain_thread if the caller requests
	 * it explicitly, if the caller is the pagedaemon, or if the
	 * lock attempt failed.
	 */
	defer = false;
	if ((curlwp == uvm.pagedaemon_lwp) || async) {
		defer = true;
	} else if (lktype == LK_SHARED) {
		/* Excellent chance of getting, if the last ref. */
		error = vn_lock(vp, LK_UPGRADE | LK_RETRY |
		    LK_NOWAIT);
		if (error != 0) {
			defer = true;
		} else {
			lktype = LK_EXCLUSIVE;
		}
	} else if (lktype == LK_NONE) {
		/* Excellent chance of getting, if the last ref. */
		error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY |
		    LK_NOWAIT);
		if (error != 0) {
			defer = true;
		} else {
			lktype = LK_EXCLUSIVE;
		}
	}
	KASSERT(mutex_owned(vp->v_interlock));
	if (defer) {
		/*
		 * Defer reclaim to the kthread; it's not safe to
		 * clean it here.  We donate it our last reference.
		 */
		if (lktype != LK_NONE) {
			VOP_UNLOCK(vp);
		}
		lru_requeue(vp, &lru_list[LRU_VRELE]);
		mutex_exit(vp->v_interlock);
		return;
	}
	KASSERT(lktype == LK_EXCLUSIVE);

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if (VSTATE_GET(vp) == VS_RECLAIMED) {
		VOP_UNLOCK(vp);
	} else {
		/*
		 * If VOP_INACTIVE() indicates that the described file has
		 * been deleted, then recycle the vnode.  Note that
		 * VOP_INACTIVE() will not drop the vnode lock.
		 *
		 * If the file has been deleted, this is a lingering
		 * reference and there is no need to worry about new
		 * references looking to do real work with the vnode (as it
		 * will have been purged from directories, caches, etc).
		 */
		recycle = false;
		mutex_exit(vp->v_interlock);
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			VOP_UNLOCK(vp);
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if ((vp->v_iflag & VI_EXECMAP) != 0 &&
		    vp->v_uobj.uo_npages != 0) {
			cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
			cpu_count(CPU_COUNT_FILEPAGES, vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			VSTATE_ASSERT(vp, VS_LOADED);
			/* vcache_reclaim drops the lock. */
			vcache_reclaim(vp);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		vcache_free(VNODE_TO_VIMPL(vp));
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		lru_requeue(vp, lru_which(vp));
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0, LK_NONE);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC, LK_NONE);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 *
 * NB: we have lockless code sequences that rely on this not blocking.
 */
void
vref(vnode_t *vp)
{

	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0)
		lru_requeue(vp, lru_which(vp));
}

/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	vholdl(vp);
	mutex_exit(vp->v_interlock);
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
		lru_requeue(vp, lru_which(vp));
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	holdrelel(vp);
	mutex_exit(vp->v_interlock);
}

/*
 * Recycle an unused vnode if caller holds the last reference.
 */
bool
vrecycle(vnode_t *vp)
{
	int error __diagused;

	mutex_enter(vp->v_interlock);

	/* Make sure we hold the last reference. */
	VSTATE_WAIT_STABLE(vp);
	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	}

	/* If the vnode is already clean we're done. */
	if (VSTATE_GET(vp) != VS_LOADED) {
		VSTATE_ASSERT(vp, VS_RECLAIMED);
		vrelel(vp, 0, LK_NONE);
		return true;
	}

	/* Prevent further references until the vnode is locked. */
	VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
	mutex_exit(vp->v_interlock);

	/*
	 * On a leaf file system this lock will always succeed as we hold
	 * the last reference and prevent further references.
	 * On layered file systems waiting for the lock would open a can of
	 * deadlocks as the lower vnodes may have other active references.
	 */
	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);

	mutex_enter(vp->v_interlock);
	VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);

	if (error) {
		mutex_exit(vp->v_interlock);
		return false;
	}

	KASSERT(vp->v_usecount == 1);
	vcache_reclaim(vp);
	vrelel(vp, 0, LK_NONE);

	return true;
}

/*
 * Helper for vrevoke() to propagate suspension from lastmp
 * to thismp.  Both args may be NULL.
 * Returns the currently suspended file system or NULL.
 */
static struct mount *
vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
{
	int error;

	if (lastmp == thismp)
		return thismp;

	if (lastmp != NULL)
		vfs_resume(lastmp);

	if (thismp == NULL)
		return NULL;

	do {
		error = vfs_suspend(thismp, 0);
	} while (error == EINTR || error == ERESTART);

	if (error == 0)
		return thismp;

	KASSERT(error == EOPNOTSUPP);
	return NULL;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	struct mount *mp;
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mp = vrevoke_suspend_next(NULL, vp->v_mount);

	mutex_enter(vp->v_interlock);
	VSTATE_WAIT_STABLE(vp);
	if (VSTATE_GET(vp) == VS_RECLAIMED) {
		mutex_exit(vp->v_interlock);
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);

		while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
			mp = vrevoke_suspend_next(mp, vq->v_mount);
			vgone(vq);
		}
	}
	vrevoke_suspend_next(mp, NULL);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{
	int lktype;

	KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	lktype = LK_EXCLUSIVE;
	mutex_enter(vp->v_interlock);
	VSTATE_WAIT_STABLE(vp);
	if (VSTATE_GET(vp) == VS_LOADED) {
		vcache_reclaim(vp);
		lktype = LK_NONE;
	}
	VSTATE_ASSERT(vp, VS_RECLAIMED);
	vrelel(vp, 0, lktype);
}

static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
	uint32_t hash = HASH32_BUF_INIT;

	KASSERT(key->vk_key_len > 0);

	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
	return hash;
}

static void
vcache_init(void)
{

	vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
	    0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache_pool != NULL);
	mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vcache_cv, "vcache");
	vcache_hashsize = desiredvnodes;
	vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache_hashmask);
}

static void
vcache_reinit(void)
{
	int i;
	uint32_t hash;
	u_long oldmask, newmask;
	struct hashhead *oldtab, *newtab;
	vnode_impl_t *vip;

	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
	mutex_enter(&vcache_lock);
	oldtab = vcache_hashtab;
	oldmask = vcache_hashmask;
	vcache_hashsize = desiredvnodes;
	vcache_hashtab = newtab;
	vcache_hashmask = newmask;
	for (i = 0; i <= oldmask; i++) {
		while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
			SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
			hash = vcache_hash(&vip->vi_key);
			SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
			    vip, vi_hash);
		}
	}
	mutex_exit(&vcache_lock);
	hashdone(oldtab, HASH_SLIST, oldmask);
}

static inline vnode_impl_t *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
	struct hashhead *hashp;
	vnode_impl_t *vip;

	KASSERT(mutex_owned(&vcache_lock));

	hashp = &vcache_hashtab[hash & vcache_hashmask];
	SLIST_FOREACH(vip, hashp, vi_hash) {
		if (key->vk_mount != vip->vi_key.vk_mount)
			continue;
		if (key->vk_key_len != vip->vi_key.vk_key_len)
			continue;
		if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
			continue;
		return vip;
	}
	return NULL;
}

/*
 * Allocate a new, uninitialized vcache node.
 */
static vnode_impl_t *
vcache_alloc(void)
{
	vnode_impl_t *vip;
	vnode_t *vp;

	vip = pool_cache_get(vcache_pool, PR_WAITOK);
	memset(vip, 0, sizeof(*vip));

	rw_init(&vip->vi_lock);

	vp = VIMPL_TO_VNODE(vip);
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	cache_vnode_init(vp);

	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	vip->vi_state = VS_LOADING;

	lru_requeue(vp, &lru_list[LRU_FREE]);

	return vip;
}

/*
 * Deallocate a vcache node in state VS_LOADING.
 *
 * vcache_lock held on entry and released on return.
 */
static void
vcache_dealloc(vnode_impl_t *vip)
{
	vnode_t *vp;

	KASSERT(mutex_owned(&vcache_lock));

	vp = VIMPL_TO_VNODE(vip);
	vfs_ref(dead_rootmount);
	vfs_insmntque(vp, dead_rootmount);
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
	mutex_exit(&vcache_lock);
	vrelel(vp, 0, LK_NONE);
}

/*
 * Free an unused, unreferenced vcache node.
 * v_interlock locked on entry.
 */
static void
vcache_free(vnode_impl_t *vip)
{
	vnode_t *vp;

	vp = VIMPL_TO_VNODE(vip);
	KASSERT(mutex_owned(vp->v_interlock));

	KASSERT(vp->v_usecount == 0);
	KASSERT(vp->v_holdcnt == 0);
	KASSERT(vp->v_writecount == 0);
	lru_requeue(vp, NULL);
	mutex_exit(vp->v_interlock);

	vfs_insmntque(vp, NULL);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		spec_node_destroy(vp);

	rw_destroy(&vip->vi_lock);
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	cache_vnode_fini(vp);
	pool_cache_put(vcache_pool, vip);
}

/*
 * Try to get an initial reference on this cached vnode.
 * Returns zero on success, ENOENT if the vnode has been reclaimed and
 * EBUSY if the vnode state is unstable.
 *
 * v_interlock locked on entry and unlocked on exit.
 */
int
vcache_tryvget(vnode_t *vp)
{
	int error = 0;

	KASSERT(mutex_owned(vp->v_interlock));

	if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED))
		error = ENOENT;
	else if (__predict_false(VSTATE_GET(vp) != VS_LOADED))
		error = EBUSY;
	else if (vp->v_usecount == 0)
		vp->v_usecount = 1;
	else
		atomic_inc_uint(&vp->v_usecount);

	mutex_exit(vp->v_interlock);

	return error;
}

/*
 * Try to get an initial reference on this cached vnode.
 * Returns zero on success and ENOENT if the vnode has been reclaimed.
 * Will wait for the vnode state to be stable.
 *
 * v_interlock locked on entry and unlocked on exit.
 */
int
vcache_vget(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));

	/* Increment hold count to prevent vnode from disappearing. */
	vp->v_holdcnt++;
	VSTATE_WAIT_STABLE(vp);
	vp->v_holdcnt--;

	/* If this was the last reference to a reclaimed vnode free it now. */
	if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
		if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
			vcache_free(VNODE_TO_VIMPL(vp));
		else
			mutex_exit(vp->v_interlock);
		return ENOENT;
	}
	VSTATE_ASSERT(vp, VS_LOADED);
	if (vp->v_usecount == 0)
		vp->v_usecount = 1;
	else
		atomic_inc_uint(&vp->v_usecount);
	mutex_exit(vp->v_interlock);

	return 0;
}

/*
 * Get a vnode / fs node pair by key and return it referenced through vpp.
 */
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
    struct vnode **vpp)
{
	int error;
	uint32_t hash;
	const void *new_key;
	struct vnode *vp;
	struct vcache_key vcache_key;
	vnode_impl_t *vip, *new_vip;

	new_key = NULL;
	*vpp = NULL;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

again:
	mutex_enter(&vcache_lock);
	vip = vcache_hash_lookup(&vcache_key, hash);

	/* If found, take a reference or retry. */
	if (__predict_true(vip != NULL)) {
		/*
		 * If the vnode is loading we cannot take the v_interlock
		 * here as it might change during load (see uvm_obj_setlock()).
		 * As changing state from VS_LOADING requires both vcache_lock
		 * and v_interlock it is safe to test with vcache_lock held.
		 *
		 * Wait for vnodes changing state from VS_LOADING and retry.
		 */
		if (__predict_false(vip->vi_state == VS_LOADING)) {
			cv_wait(&vcache_cv, &vcache_lock);
			mutex_exit(&vcache_lock);
			goto again;
		}
		vp = VIMPL_TO_VNODE(vip);
		mutex_enter(vp->v_interlock);
		mutex_exit(&vcache_lock);
		error = vcache_vget(vp);
		if (error == ENOENT)
			goto again;
		if (error == 0)
			*vpp = vp;
		KASSERT((error != 0) == (*vpp == NULL));
		return error;
	}
	mutex_exit(&vcache_lock);

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp);
	if (error)
		return error;
	new_vip = vcache_alloc();
	new_vip->vi_key = vcache_key;
	vp = VIMPL_TO_VNODE(new_vip);
	mutex_enter(&vcache_lock);
	vip = vcache_hash_lookup(&vcache_key, hash);
	if (vip == NULL) {
		SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
		    new_vip, vi_hash);
		vip = new_vip;
	}

	/* If another thread beat us inserting this node, retry. */
	if (vip != new_vip) {
		vcache_dealloc(new_vip);
		vfs_unbusy(mp);
		goto again;
	}
	mutex_exit(&vcache_lock);

	/* Load the fs node.  Exclusive as new_vip is VS_LOADING. */
	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
	if (error) {
		mutex_enter(&vcache_lock);
		SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
		    new_vip, vnode_impl, vi_hash);
		vcache_dealloc(new_vip);
		vfs_unbusy(mp);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_key != NULL);
	KASSERT(memcmp(key, new_key, key_len) == 0);
	KASSERT(vp->v_op != NULL);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_ref(mp);
	vfs_unbusy(mp);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache_lock);
	new_vip->vi_key.vk_key = new_key;
	mutex_enter(vp->v_interlock);
	VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
	mutex_exit(vp->v_interlock);
	mutex_exit(&vcache_lock);
	*vpp = vp;
	return 0;
}
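
/*
 * Illustrative sketch of a vcache_get() caller: a disk file system
 * would typically use something like its inode number as the key
 * (the key layout is file system specific; "ino" is a hypothetical
 * example).
 *
 *	ino_t ino = ...;
 *	struct vnode *vp;
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error == 0)
 *		...vp is referenced; vrele(vp) or vput(vp) when done...
 */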

/*
 * Create a new vnode / fs node pair and return it referenced through vpp.
 */
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, void *extra, struct vnode **vpp)
{
	int error;
	uint32_t hash;
	struct vnode *vp, *ovp;
	vnode_impl_t *vip, *ovip;

	*vpp = NULL;

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp);
	if (error)
		return error;
	vip = vcache_alloc();
	vip->vi_key.vk_mount = mp;
	vp = VIMPL_TO_VNODE(vip);

	/* Create and load the fs node. */
	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
	    &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
	if (error) {
		mutex_enter(&vcache_lock);
		vcache_dealloc(vip);
		vfs_unbusy(mp);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(vp->v_op != NULL);
	KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
	if (vip->vi_key.vk_key_len > 0) {
		KASSERT(vip->vi_key.vk_key != NULL);
		hash = vcache_hash(&vip->vi_key);

		/*
		 * Wait for previous instance to be reclaimed,
		 * then insert new node.
		 */
		mutex_enter(&vcache_lock);
		while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
			ovp = VIMPL_TO_VNODE(ovip);
			mutex_enter(ovp->v_interlock);
			mutex_exit(&vcache_lock);
			error = vcache_vget(ovp);
			KASSERT(error == ENOENT);
			mutex_enter(&vcache_lock);
		}
		SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
		    vip, vi_hash);
		mutex_exit(&vcache_lock);
	}
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_ref(mp);
	vfs_unbusy(mp);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache_lock);
	mutex_enter(vp->v_interlock);
	VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
	mutex_exit(&vcache_lock);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}
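
/*
 * Illustrative sketch of a vcache_new() caller (hypothetical create
 * path; argument values beyond the signature above are examples):
 *
 *	struct vnode *vp;
 *	error = vcache_new(dvp->v_mount, dvp, vap, cred, NULL, &vp);
 *	if (error == 0)
 *		...vp is referenced and LOADED...
 */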

/*
 * Prepare key change: update the old cache node's key and lock the new
 * cache node.  Return an error if the new node already exists.
 */
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	vnode_impl_t *vip, *new_vip;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	new_vip = vcache_alloc();
	new_vip->vi_key = new_vcache_key;

	/* Insert locked new node used as placeholder. */
	mutex_enter(&vcache_lock);
	vip = vcache_hash_lookup(&new_vcache_key, new_hash);
	if (vip != NULL) {
		vcache_dealloc(new_vip);
		return EEXIST;
	}
	SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
	    new_vip, vi_hash);

	/* Replace the old node's key with the temporary copy. */
	vip = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(vip != NULL);
	KASSERT(VIMPL_TO_VNODE(vip) == vp);
	KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
	vip->vi_key = old_vcache_key;
	mutex_exit(&vcache_lock);
	return 0;
}

/*
 * Key change complete: update old node and remove placeholder.
 */
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	vnode_impl_t *vip, *new_vip;
	struct vnode *new_vp;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	mutex_enter(&vcache_lock);

	/* Lookup old and new node. */
	vip = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(vip != NULL);
	KASSERT(VIMPL_TO_VNODE(vip) == vp);

	new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
	KASSERT(new_vip != NULL);
	KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
	new_vp = VIMPL_TO_VNODE(new_vip);
	mutex_enter(new_vp->v_interlock);
	VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
	mutex_exit(new_vp->v_interlock);

	/* Rekey old node and put it onto its new hashlist. */
	vip->vi_key = new_vcache_key;
	if (old_hash != new_hash) {
		SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
		    vip, vnode_impl, vi_hash);
		SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
		    vip, vi_hash);
	}

	/* Remove new node used as placeholder. */
	SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
	    new_vip, vnode_impl, vi_hash);
	vcache_dealloc(new_vip);
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with vnode locked and will return unlocked.
 * Must be called with the interlock held, and will return with it held.
 */
static void
vcache_reclaim(vnode_t *vp)
{
	lwp_t *l = curlwp;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
	struct mount *mp = vp->v_mount;
	uint32_t hash;
	uint8_t temp_buf[64], *temp_key;
	size_t temp_key_len;
	bool recycle, active;
	int error;

	KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	active = (vp->v_usecount > 1);
	temp_key_len = vip->vi_key.vk_key_len;
	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING);
	if ((vp->v_iflag & VI_EXECMAP) != 0 && vp->v_uobj.uo_npages != 0) {
		cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
		cpu_count(CPU_COUNT_FILEPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/*
	 * With vnode state set to reclaiming, purge name cache immediately
	 * to prevent new handles on vnode, and wait for existing threads
	 * trying to get a handle to notice VS_RECLAIMED status and abort.
	 */
	cache_purge(vp);

	/* Replace the vnode key with a temporary copy. */
	if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
		temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
	} else {
		temp_key = temp_buf;
	}
	if (vip->vi_key.vk_key_len > 0) {
		mutex_enter(&vcache_lock);
		memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
		vip->vi_key.vk_key = temp_key;
		mutex_exit(&vcache_lock);
	}

	fstrans_start(mp);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.
	 */
	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
	if (error != 0) {
		if (wapbl_vphaswapbl(vp))
			WAPBL_DISCARD(wapbl_vptomp(vp));
		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
	}
	KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
	if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
		spec_node_revoke(vp);
	}

	/*
	 * Disassociate the underlying file system from the vnode.
	 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
	 * the vnode, and may destroy the vnode so that VOP_UNLOCK
	 * would no longer function.
	 */
	VOP_INACTIVE(vp, &recycle);
	KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	if (vip->vi_key.vk_key_len > 0) {
		/* Remove from vnode cache. */
		hash = vcache_hash(&vip->vi_key);
		mutex_enter(&vcache_lock);
		KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
		SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
		    vip, vnode_impl, vi_hash);
		mutex_exit(&vcache_lock);
	}
	if (temp_key != temp_buf)
		kmem_free(temp_key, temp_key_len);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_vflag |= VV_LOCKSWORK;
	VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	mutex_exit(vp->v_interlock);

	/*
	 * Move to dead mount.  Must be after changing the operations
	 * vector as vnode operations enter the mount before using the
	 * operations vector.  See sys/kern/vnode_if.c.
	 */
	vp->v_vflag &= ~VV_ROOT;
	vfs_ref(dead_rootmount);
	vfs_insmntque(vp, dead_rootmount);

#ifdef PAX_SEGVGUARD
	pax_segvguard_cleanup(vp);
#endif /* PAX_SEGVGUARD */

	mutex_enter(vp->v_interlock);
	fstrans_done(mp);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Disassociate the underlying file system from an open device vnode
 * and make it anonymous.
 *
 * Vnode unlocked on entry, drops a reference to the vnode.
 */
void
vcache_make_anon(vnode_t *vp)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
	uint32_t hash;
	bool recycle;

	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
	VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);

	/* Remove from vnode cache. */
	hash = vcache_hash(&vip->vi_key);
	mutex_enter(&vcache_lock);
	KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
	SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
	    vip, vnode_impl, vi_hash);
	vip->vi_key.vk_mount = dead_rootmount;
	vip->vi_key.vk_key_len = 0;
	vip->vi_key.vk_key = NULL;
	mutex_exit(&vcache_lock);

	/*
	 * Disassociate the underlying file system from the vnode.
	 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
	 * the vnode, and may destroy the vnode so that VOP_UNLOCK
	 * would no longer function.
	 */
	if (vn_lock(vp, LK_EXCLUSIVE)) {
		vnpanic(vp, "%s: cannot lock", __func__);
	}
	VOP_INACTIVE(vp, &recycle);
	KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Done with purge, change operations vector. */
	mutex_enter(vp->v_interlock);
	vp->v_op = spec_vnodeop_p;
	vp->v_vflag |= VV_MPSAFE;
	vp->v_vflag &= ~VV_LOCKSWORK;
	mutex_exit(vp->v_interlock);

	/*
	 * Move to dead mount.  Must be after changing the operations
	 * vector as vnode operations enter the mount before using the
	 * operations vector.  See sys/kern/vnode_if.c.
	 */
	vfs_ref(dead_rootmount);
	vfs_insmntque(vp, dead_rootmount);

	vrele(vp);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Test a vnode for being or becoming dead.  Returns one of:
 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 * ENOENT: vnode is dead.
 * 0:      otherwise.
 *
 * Whenever this function returns a non-zero value all future
 * calls will also return a non-zero value.
 */
int
vdead_check(struct vnode *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (! ISSET(flags, VDEAD_NOWAIT))
		VSTATE_WAIT_STABLE(vp);

	if (VSTATE_GET(vp) == VS_RECLAIMING) {
		KASSERT(ISSET(flags, VDEAD_NOWAIT));
		return EBUSY;
	} else if (VSTATE_GET(vp) == VS_RECLAIMED) {
		return ENOENT;
	}

	return 0;
}
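
/*
 * Illustrative sketch of a non-sleeping vdead_check() caller, probing
 * with VDEAD_NOWAIT and treating EBUSY as "becoming dead":
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 */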

int
vfs_drainvnodes(void)
{
	int i, gen;

	mutex_enter(&vdrain_lock);
	for (i = 0; i < 2; i++) {
		gen = vdrain_gen;
		while (gen == vdrain_gen) {
			cv_broadcast(&vdrain_cv);
			cv_wait(&vdrain_gen_cv, &vdrain_lock);
		}
	}
	mutex_exit(&vdrain_lock);

	if (numvnodes >= desiredvnodes)
		return EBUSY;

	if (vcache_hashsize != desiredvnodes)
		vcache_reinit();

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}