/*	$NetBSD: vfs_vnode.c,v 1.55 2016/08/20 12:33:57 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via vcache_get(9) or vcache_new(9).
 *	- Reclamation of inactive vnode, via vget(9).
 *
 *	Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles the vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually, it
 *	checks its own references, e.g. count of links, whether the file
 *	was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (cache), or cleaned via vcache_reclaim(), which calls
 *	VOP_RECLAIM(9) to disassociate the underlying file system from the
 *	vnode, and is finally destroyed.
 *
 * Vnode state
 *
 *	Vnode is always in one of six states:
 *	- MARKER	This is a marker vnode to help list traversal.  It
 *			will never change its state.
 *	- LOADING	Vnode is associating underlying file system and not
 *			yet ready to use.
 *	- ACTIVE	Vnode has associated underlying file system and is
 *			ready to use.
 *	- BLOCKED	Vnode is active but cannot get new references.
 *	- RECLAIMING	Vnode is disassociating from the underlying file
 *			system.
 *	- RECLAIMED	Vnode has disassociated from underlying file system
 *			and is dead.
 *
 *	Valid state changes are:
 *	LOADING -> ACTIVE
 *			Vnode has been initialised in vcache_get() or
 *			vcache_new() and is ready to use.
 *	ACTIVE -> RECLAIMING
 *			Vnode starts disassociation from underlying file
 *			system in vcache_reclaim().
 *	RECLAIMING -> RECLAIMED
 *			Vnode finished disassociation from underlying file
 *			system in vcache_reclaim().
 *	ACTIVE -> BLOCKED
 *			Either vcache_rekey*() is changing the vnode key or
 *			vrelel() is about to call VOP_INACTIVE().
 *	BLOCKED -> ACTIVE
 *			The block condition is over.
 *	LOADING -> RECLAIMED
 *			Either vcache_get() or vcache_new() failed to
 *			associate the underlying file system or vcache_rekey*()
 *			drops a vnode used as placeholder.
 *
 *	Of these states LOADING, BLOCKED and RECLAIMING are intermediate
 *	and it is possible to wait for state change.
 *
 *	State is protected with v_interlock with one exception:
 *	to change from LOADING both v_interlock and vcache.lock must be held
 *	so it is possible to check "state == LOADING" without holding
 *	v_interlock.  See vcache_get() for details.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained with
 *	the vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are e.g. file openings, current working directories,
 *	mount points, etc.  An illustrative usage sketch follows this
 *	comment.
 *
 *	Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change from a
 *	non-zero value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to another non-zero
 *	value can safely be done using atomic operations, without the
 *	interlock held.
 */
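
/*
 * Illustrative sketch (not part of the original file): a typical consumer
 * of the reference-counting rules described above.  The function name
 * example_hold_root() and variable rvp are hypothetical; VFS_ROOT(),
 * vref(), vput() and vrele() are the real interfaces.  This is a minimal
 * sketch, not NetBSD code.
 *
 *	static int
 *	example_hold_root(struct mount *mp, vnode_t **rvpp)
 *	{
 *		vnode_t *rvp;
 *		int error;
 *
 *		// VFS_ROOT() returns the root vnode referenced and locked.
 *		error = VFS_ROOT(mp, &rvp);
 *		if (error != 0)
 *			return error;
 *		// Adding a reference to an already active vnode needs no lock.
 *		vref(rvp);
 *		// Unlock and drop the reference VFS_ROOT() gave us.
 *		vput(rvp);
 *		*rvpp = rvp;	// caller eventually drops it with vrele()
 *		return 0;
 *	}
 */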

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.55 2016/08/20 12:33:57 hannken Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */

enum vcache_state {
	VN_MARKER,	/* Stable, used as marker. Will not change. */
	VN_LOADING,	/* Intermediate, initialising the fs node. */
	VN_ACTIVE,	/* Stable, valid fs node attached. */
	VN_BLOCKED,	/* Intermediate, active, no new references allowed. */
	VN_RECLAIMING,	/* Intermediate, detaching the fs node. */
	VN_RECLAIMED	/* Stable, no fs node attached. */
};
struct vcache_key {
	struct mount *vk_mount;
	const void *vk_key;
	size_t vk_key_len;
};
struct vcache_node {
	struct vnode vn_vnode;
	enum vcache_state vn_state;
	SLIST_ENTRY(vcache_node) vn_hash;
	struct vcache_key vn_key;
};

#define	VN_TO_VP(node)	((vnode_t *)(node))
#define	VP_TO_VN(vp)	((struct vcache_node *)(vp))

u_int			numvnodes		__cacheline_aligned;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

SLIST_HEAD(hashhead, vcache_node);
static struct {
	kmutex_t	lock;
	kcondvar_t	cv;
	u_long		hashmask;
	struct hashhead	*hashtab;
	pool_cache_t	pool;
}			vcache			__cacheline_aligned;

static int		cleanvnode(void);
static struct vcache_node *vcache_alloc(void);
static void		vcache_free(struct vcache_node *);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vcache_reclaim(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern struct mount	*dead_rootmount;
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

/* Vnode state operations and diagnostics. */

static const char *
vstate_name(enum vcache_state state)
{

	switch (state) {
	case VN_MARKER:
		return "MARKER";
	case VN_LOADING:
		return "LOADING";
	case VN_ACTIVE:
		return "ACTIVE";
	case VN_BLOCKED:
		return "BLOCKED";
	case VN_RECLAIMING:
		return "RECLAIMING";
	case VN_RECLAIMED:
		return "RECLAIMED";
	default:
		return "ILLEGAL";
	}
}

#if defined(DIAGNOSTIC)

#define VSTATE_GET(vp) \
	vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
	vstate_assert_wait_stable((vp), __func__, __LINE__)
#define VSTATE_ASSERT(vp, state) \
	vstate_assert((vp), (state), __func__, __LINE__)

static void
vstate_assert(vnode_t *vp, enum vcache_state state, const char *func, int line)
{
	struct vcache_node *node = VP_TO_VN(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);

	if (__predict_true(node->vn_state == state))
		return;
	vnpanic(vp, "state is %s, expected %s at %s:%d",
	    vstate_name(node->vn_state), vstate_name(state), func, line);
}

static enum vcache_state
vstate_assert_get(vnode_t *vp, const char *func, int line)
{
	struct vcache_node *node = VP_TO_VN(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (node->vn_state == VN_MARKER)
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(node->vn_state), func, line);

	return node->vn_state;
}

static void
vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
{
	struct vcache_node *node = VP_TO_VN(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (node->vn_state == VN_MARKER)
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(node->vn_state), func, line);

	while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED)
		cv_wait(&vp->v_cv, vp->v_interlock);

	if (node->vn_state == VN_MARKER)
		vnpanic(vp, "state is %s at %s:%d",
		    vstate_name(node->vn_state), func, line);
}

static void
vstate_assert_change(vnode_t *vp, enum vcache_state from, enum vcache_state to,
    const char *func, int line)
{
	struct vcache_node *node = VP_TO_VN(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (from == VN_LOADING)
		KASSERTMSG(mutex_owned(&vcache.lock), "at %s:%d", func, line);

	if (from == VN_MARKER)
		vnpanic(vp, "from is %s at %s:%d",
		    vstate_name(from), func, line);
	if (to == VN_MARKER)
		vnpanic(vp, "to is %s at %s:%d",
		    vstate_name(to), func, line);
	if (node->vn_state != from)
		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
		    vstate_name(node->vn_state), vstate_name(from), func, line);

	node->vn_state = to;
	if (from == VN_LOADING)
		cv_broadcast(&vcache.cv);
	if (to == VN_ACTIVE || to == VN_RECLAIMED)
		cv_broadcast(&vp->v_cv);
}

#else /* defined(DIAGNOSTIC) */

#define VSTATE_GET(vp) \
	(VP_TO_VN((vp))->vn_state)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_change((vp), (from), (to))
#define VSTATE_WAIT_STABLE(vp) \
	vstate_wait_stable((vp))
#define VSTATE_ASSERT(vp, state)

static void
vstate_wait_stable(vnode_t *vp)
{
	struct vcache_node *node = VP_TO_VN(vp);

	while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

static void
vstate_change(vnode_t *vp, enum vcache_state from, enum vcache_state to)
{
	struct vcache_node *node = VP_TO_VN(vp);

	node->vn_state = to;
	if (from == VN_LOADING)
		cv_broadcast(&vcache.cv);
	if (to == VN_ACTIVE || to == VN_RECLAIMED)
		cv_broadcast(&vp->v_cv);
}

#endif /* defined(DIAGNOSTIC) */

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_rootmount != NULL);
	dead_rootmount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	vcache_init();

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERTMSG((error == 0), "kthread_create(vrele) failed: %d", error);
}

/*
 * Allocate a new marker vnode.
 */
vnode_t *
vnalloc_marker(struct mount *mp)
{
	struct vcache_node *node;
	vnode_t *vp;

	node = pool_cache_get(vcache.pool, PR_WAITOK);
	memset(node, 0, sizeof(*node));
	vp = VN_TO_VP(node);
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	vp->v_mount = mp;
	vp->v_type = VBAD;
	node->vn_state = VN_MARKER;

	return vp;
}

/*
 * Free a marker vnode.
 */
void
vnfree_marker(vnode_t *vp)
{
	struct vcache_node *node;

	node = VP_TO_VN(vp);
	KASSERT(node->vn_state == VN_MARKER);
	uvm_obj_destroy(&vp->v_uobj, true);
	pool_cache_put(vcache.pool, node);
}

/*
 * Test a vnode for being a marker vnode.
 */
bool
vnis_marker(vnode_t *vp)
{

	return (VP_TO_VN(vp)->vn_state == VN_MARKER);
}

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
			continue;
		if (!mutex_tryenter(vp->v_interlock)) {
			VOP_UNLOCK(vp);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			VOP_UNLOCK(vp);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	vcache_reclaim(vp);
	vrelel(vp, 0);
	fstrans_done(mp);

	return 0;
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and return it.
 *
 * => Must be called with v_interlock held.
 *
 * If state is VN_RECLAIMING, the vnode may be eliminated in vcache_reclaim().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 *
 * If state is VN_LOADING or VN_BLOCKED, wait until the vnode enters a
 * stable state (VN_ACTIVE or VN_RECLAIMED).
 */
int
vget(vnode_t *vp, int flags, bool waitok)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~LK_NOWAIT) == 0);
	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if (! ISSET(flags, LK_NOWAIT))
		VSTATE_WAIT_STABLE(vp);
	if (VSTATE_GET(vp) == VN_RECLAIMED) {
		vrelel(vp, 0);
		return ENOENT;
	} else if (VSTATE_GET(vp) != VN_ACTIVE) {
		KASSERT(ISSET(flags, LK_NOWAIT));
		vrelel(vp, 0);
		return EBUSY;
	}

	/*
	 * Ok, we got it in good shape.
	 */
	VSTATE_ASSERT(vp, VN_ACTIVE);
	mutex_exit(vp->v_interlock);

	return 0;
}
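
/*
 * Illustrative sketch (not part of the original file): how a caller that
 * found a vnode through some index typically turns it into a usable
 * reference with vget().  example_ref_found_vnode() is a hypothetical
 * name; the locking protocol (enter v_interlock, then vget()) is the one
 * documented above.  A minimal sketch, not NetBSD code.
 *
 *	static int
 *	example_ref_found_vnode(vnode_t *vp)
 *	{
 *		int error;
 *
 *		mutex_enter(vp->v_interlock);
 *		// May sleep; returns ENOENT if the vnode was reclaimed.
 *		// vget() releases the interlock before returning.
 *		error = vget(vp, 0, true);
 *		if (error != 0)
 *			return error;
 *		// ... use the vnode, e.g. vn_lock() it ...
 *		vrele(vp);
 *		return 0;
 *	}
 */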

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    VSTATE_GET(vp) != VN_RECLAIMED)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if (VSTATE_GET(vp) != VN_RECLAIMED) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERTMSG((error == 0), "vn_lock failed: %d", error);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			mutex_exit(vp->v_interlock);
			return;
		}
		VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED);
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode must not gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		if (recycle) {
			/* vcache_reclaim() below will drop the lock. */
			if (vn_lock(vp, LK_EXCLUSIVE) != 0)
				recycle = false;
		}
		mutex_enter(vp->v_interlock);
		VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE);
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			VSTATE_ASSERT(vp, VN_ACTIVE);
			vcache_reclaim(vp);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if (VSTATE_GET(vp) == VN_RECLAIMED) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vcache_free(VP_TO_VN(vp));
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release, vnode is released in different context.
 */
void
vrele_async(vnode_t *vp)
{

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnodelst_t skip_list;
	vnode_t *vp;
	struct mount *mp;

	TAILQ_INIT(&skip_list);

	mutex_enter(&vrele_lock);
	for (;;) {
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
		}
		vp = TAILQ_FIRST(&vrele_list);
		mp = vp->v_mount;
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
			continue;
		}
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
		fstrans_done(mp);
		mutex_enter(&vrele_lock);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Recycle an unused vnode if caller holds the last reference.
 */
bool
vrecycle(vnode_t *vp)
{

	if (vn_lock(vp, LK_EXCLUSIVE) != 0)
		return false;

	mutex_enter(vp->v_interlock);

	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		VOP_UNLOCK(vp);
		return false;
	}
	vcache_reclaim(vp);
	vrelel(vp, 0);
	return true;
}
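
/*
 * Illustrative sketch (not part of the original file): a file system that
 * wants to dispose of a vnode it holds the last reference to, e.g. after
 * finding stale file system data, might use vrecycle().  The helper name
 * example_discard_vnode() is hypothetical.
 *
 *	static void
 *	example_discard_vnode(vnode_t *vp)
 *	{
 *		// Reclaims and releases vp if we really hold the last
 *		// reference; otherwise the reference is still ours and we
 *		// drop it the normal way.
 *		if (!vrecycle(vp))
 *			vrele(vp);
 *	}
 */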

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	VSTATE_WAIT_STABLE(vp);
	if (VSTATE_GET(vp) == VN_RECLAIMED) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	if (vn_lock(vp, LK_EXCLUSIVE) != 0) {
		/* Already reclaimed: just drop our reference. */
		VSTATE_ASSERT(vp, VN_RECLAIMED);
		vrele(vp);
		return;
	}

	mutex_enter(vp->v_interlock);
	vcache_reclaim(vp);
	vrelel(vp, 0);
}

static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
	uint32_t hash = HASH32_BUF_INIT;

	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
	return hash;
}

static void
vcache_init(void)
{

	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache.pool != NULL);
	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vcache.cv, "vcache");
	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache.hashmask);
}

static void
vcache_reinit(void)
{
	int i;
	uint32_t hash;
	u_long oldmask, newmask;
	struct hashhead *oldtab, *newtab;
	struct vcache_node *node;

	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
	mutex_enter(&vcache.lock);
	oldtab = vcache.hashtab;
	oldmask = vcache.hashmask;
	vcache.hashtab = newtab;
	vcache.hashmask = newmask;
	for (i = 0; i <= oldmask; i++) {
		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
			hash = vcache_hash(&node->vn_key);
			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
			    node, vn_hash);
		}
	}
	mutex_exit(&vcache.lock);
	hashdone(oldtab, HASH_SLIST, oldmask);
}

static inline struct vcache_node *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
	struct hashhead *hashp;
	struct vcache_node *node;

	KASSERT(mutex_owned(&vcache.lock));

	hashp = &vcache.hashtab[hash & vcache.hashmask];
	SLIST_FOREACH(node, hashp, vn_hash) {
		if (key->vk_mount != node->vn_key.vk_mount)
			continue;
		if (key->vk_key_len != node->vn_key.vk_key_len)
			continue;
		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
			continue;
		return node;
	}
	return NULL;
}

/*
 * Allocate a new, uninitialized vcache node.
 */
static struct vcache_node *
vcache_alloc(void)
{
	struct vcache_node *node;
	vnode_t *vp;

	node = pool_cache_get(vcache.pool, PR_WAITOK);
	memset(node, 0, sizeof(*node));

	/* SLIST_INIT(&node->vn_hash); */

	vp = VN_TO_VP(node);
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/* LIST_INIT(&vp->v_nclist); */
	/* LIST_INIT(&vp->v_dnclist); */

	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);

	rw_init(&vp->v_lock);
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	node->vn_state = VN_LOADING;

	return node;
}

/*
 * Free an unused, unreferenced vcache node.
 */
static void
vcache_free(struct vcache_node *node)
{
	vnode_t *vp;

	vp = VN_TO_VP(node);

	KASSERT(vp->v_usecount == 0);

	rw_destroy(&vp->v_lock);
	mutex_enter(&vnode_free_list_lock);
	numvnodes--;
	mutex_exit(&vnode_free_list_lock);

	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vcache.pool, node);
}

/*
 * Get a vnode / fs node pair by key and return it referenced through vpp.
 */
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
    struct vnode **vpp)
{
	int error;
	uint32_t hash;
	const void *new_key;
	struct vnode *vp;
	struct vcache_key vcache_key;
	struct vcache_node *node, *new_node;

	new_key = NULL;
	*vpp = NULL;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

again:
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);

	/* If found, take a reference or retry. */
	if (__predict_true(node != NULL)) {
		/*
		 * If the vnode is loading we cannot take the v_interlock
		 * here as it might change during load (see uvm_obj_setlock()).
		 * As changing state from VN_LOADING requires both vcache.lock
		 * and v_interlock it is safe to test with vcache.lock held.
		 *
		 * Wait for vnodes changing state from VN_LOADING and retry.
		 */
		if (__predict_false(node->vn_state == VN_LOADING)) {
			cv_wait(&vcache.cv, &vcache.lock);
			mutex_exit(&vcache.lock);
			goto again;
		}
		vp = VN_TO_VP(node);
		mutex_enter(vp->v_interlock);
		mutex_exit(&vcache.lock);
		error = vget(vp, 0, true /* wait */);
		if (error == ENOENT)
			goto again;
		if (error == 0)
			*vpp = vp;
		KASSERT((error != 0) == (*vpp == NULL));
		return error;
	}
	mutex_exit(&vcache.lock);

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = vcache_alloc();
	new_node->vn_key = vcache_key;
	vp = VN_TO_VP(new_node);
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	if (node == NULL) {
		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vn_hash);
		node = new_node;
	}

	/* If another thread beat us inserting this node, retry. */
	if (node != new_node) {
		mutex_enter(vp->v_interlock);
		VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED);
		mutex_exit(&vcache.lock);
		vrelel(vp, 0);
		vfs_unbusy(mp, false, NULL);
		goto again;
	}
	mutex_exit(&vcache.lock);

	/* Load the fs node.  Exclusive as new_node is VN_LOADING. */
	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
	if (error) {
		mutex_enter(&vcache.lock);
		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vcache_node, vn_hash);
		mutex_enter(vp->v_interlock);
		VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED);
		mutex_exit(&vcache.lock);
		vrelel(vp, 0);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_key != NULL);
	KASSERT(memcmp(key, new_key, key_len) == 0);
	KASSERT(vp->v_op != NULL);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_key.vk_key = new_key;
	mutex_enter(vp->v_interlock);
	VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE);
	mutex_exit(vp->v_interlock);
	mutex_exit(&vcache.lock);
	*vpp = vp;
	return 0;
}
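
/*
 * Illustrative sketch (not part of the original file): how a file system's
 * vget-style routine would use vcache_get(), keying the cache by its inode
 * number.  example_fs_vget() is a hypothetical name; real file systems pass
 * their own key and do the same dance.  A minimal sketch, not NetBSD code.
 *
 *	static int
 *	example_fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 *	{
 *		int error;
 *
 *		// Returns a referenced, unlocked vnode; VFS_LOADVNODE() is
 *		// called on our behalf if the pair is not cached yet.
 *		error = vcache_get(mp, &ino, sizeof(ino), vpp);
 *		if (error != 0)
 *			return error;
 *		error = vn_lock(*vpp, LK_EXCLUSIVE);
 *		if (error != 0) {
 *			vrele(*vpp);
 *			*vpp = NULL;
 *		}
 *		return error;
 *	}
 */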

/*
 * Create a new vnode / fs node pair and return it referenced through vpp.
 */
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, struct vnode **vpp)
{
	int error;
	uint32_t hash;
	struct vnode *ovp, *vp;
	struct vcache_node *new_node;
	struct vcache_node *old_node __diagused;

	*vpp = NULL;

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = vcache_alloc();
	new_node->vn_key.vk_mount = mp;
	vp = VN_TO_VP(new_node);

	/* Create and load the fs node. */
	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
	    &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
	if (error) {
		mutex_enter(&vcache.lock);
		mutex_enter(vp->v_interlock);
		VSTATE_CHANGE(vp, VN_LOADING, VN_RECLAIMED);
		mutex_exit(&vcache.lock);
		vrelel(vp, 0);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_node->vn_key.vk_key != NULL);
	KASSERT(vp->v_op != NULL);
	hash = vcache_hash(&new_node->vn_key);

	/* Wait for previous instance to be reclaimed, then insert new node. */
	mutex_enter(&vcache.lock);
	while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
		ovp = VN_TO_VP(old_node);
		mutex_enter(ovp->v_interlock);
		mutex_exit(&vcache.lock);
		error = vget(ovp, 0, true /* wait */);
		KASSERT(error == ENOENT);
		mutex_enter(&vcache.lock);
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
	    new_node, vn_hash);
	mutex_exit(&vcache.lock);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	mutex_enter(vp->v_interlock);
	VSTATE_CHANGE(vp, VN_LOADING, VN_ACTIVE);
	mutex_exit(&vcache.lock);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}
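
/*
 * Illustrative sketch (not part of the original file): how a file system's
 * create operation would use vcache_new() to allocate the new file's vnode.
 * example_fs_create() and its argument names are hypothetical; the real work
 * is forwarded to the file system's VFS_NEWVNODE() operation.
 *
 *	static int
 *	example_fs_create(struct vnode *dvp, struct vnode **vpp,
 *	    struct vattr *vap, kauth_cred_t cred)
 *	{
 *		int error;
 *
 *		// Returns a referenced, unlocked vnode for the new file.
 *		error = vcache_new(dvp->v_mount, dvp, vap, cred, vpp);
 *		if (error != 0)
 *			return error;
 *		// ... enter the name into the directory dvp, etc. ...
 *		return 0;
 *	}
 */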

/*
 * Prepare key change: lock old and new cache node.
 * Return an error if the new node already exists.
 */
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node, *new_node;
	struct vnode *tvp;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	new_node = vcache_alloc();
	new_node->vn_key = new_vcache_key;
	tvp = VN_TO_VP(new_node);

	/* Insert locked new node used as placeholder. */
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	if (node != NULL) {
		mutex_enter(tvp->v_interlock);
		VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED);
		mutex_exit(&vcache.lock);
		vrelel(tvp, 0);
		return EEXIST;
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
	    new_node, vn_hash);

	/* Lock old node. */
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(VN_TO_VP(node) == vp);
	mutex_enter(vp->v_interlock);
	VSTATE_CHANGE(vp, VN_ACTIVE, VN_BLOCKED);
	node->vn_key = old_vcache_key;
	mutex_exit(vp->v_interlock);
	mutex_exit(&vcache.lock);
	return 0;
}

/*
 * Key change complete: remove old node and unlock new node.
 */
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *old_node, *new_node;
	struct vnode *tvp;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	mutex_enter(&vcache.lock);

	/* Lookup old and new node. */
	old_node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(old_node != NULL);
	KASSERT(VN_TO_VP(old_node) == vp);
	mutex_enter(vp->v_interlock);
	VSTATE_ASSERT(vp, VN_BLOCKED);

	new_node = vcache_hash_lookup(&new_vcache_key, new_hash);
	KASSERT(new_node != NULL);
	KASSERT(new_node->vn_key.vk_key_len == new_key_len);
	tvp = VN_TO_VP(new_node);
	mutex_enter(tvp->v_interlock);
	VSTATE_ASSERT(VN_TO_VP(new_node), VN_LOADING);

	/* Rekey old node and put it onto its new hashlist. */
	old_node->vn_key = new_vcache_key;
	if (old_hash != new_hash) {
		SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
		    old_node, vcache_node, vn_hash);
		SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
		    old_node, vn_hash);
	}
	VSTATE_CHANGE(vp, VN_BLOCKED, VN_ACTIVE);
	mutex_exit(vp->v_interlock);

	/* Remove new node used as placeholder. */
	SLIST_REMOVE(&vcache.hashtab[new_hash & vcache.hashmask],
	    new_node, vcache_node, vn_hash);
	VSTATE_CHANGE(tvp, VN_LOADING, VN_RECLAIMED);
	mutex_exit(&vcache.lock);
	vrelel(tvp, 0);
}
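
/*
 * Illustrative sketch (not part of the original file): a file system whose
 * cache key changes when a node moves (e.g. a directory-offset based key)
 * brackets the change with vcache_rekey_enter() / vcache_rekey_exit().
 * example_fs_rekey() and the key variables are hypothetical.
 *
 *	static int
 *	example_fs_rekey(struct mount *mp, struct vnode *vp,
 *	    uint64_t old_key, uint64_t new_key)
 *	{
 *		int error;
 *
 *		error = vcache_rekey_enter(mp, vp, &old_key, sizeof(old_key),
 *		    &new_key, sizeof(new_key));
 *		if (error != 0)
 *			return error;	// EEXIST: target key already present
 *		// ... update the data that defines the key ...
 *		vcache_rekey_exit(mp, vp, &old_key, sizeof(old_key),
 *		    &new_key, sizeof(new_key));
 *		return 0;
 *	}
 */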

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with vnode locked and will return unlocked.
 * Must be called with the interlock held, and will return with it held.
 */
static void
vcache_reclaim(vnode_t *vp)
{
	lwp_t *l = curlwp;
	struct vcache_node *node = VP_TO_VN(vp);
	uint32_t hash;
	uint8_t temp_buf[64], *temp_key;
	size_t temp_key_len;
	bool recycle, active;
	int error;

	KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	active = (vp->v_usecount > 1);
	temp_key_len = node->vn_key.vk_key_len;
	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	VSTATE_CHANGE(vp, VN_ACTIVE, VN_RECLAIMING);
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/* Replace the vnode key with a temporary copy. */
	if (node->vn_key.vk_key_len > sizeof(temp_buf)) {
		temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
	} else {
		temp_key = temp_buf;
	}
	mutex_enter(&vcache.lock);
	memcpy(temp_key, node->vn_key.vk_key, temp_key_len);
	node->vn_key.vk_key = temp_key;
	mutex_exit(&vcache.lock);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
	if (error != 0) {
		if (wapbl_vphaswapbl(vp))
			WAPBL_DISCARD(wapbl_vptomp(vp));
		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
	}
	KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
	if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
		spec_node_revoke(vp);
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VN_RECLAIMED, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Move to dead mount. */
	vp->v_vflag &= ~VV_ROOT;
	atomic_inc_uint(&dead_rootmount->mnt_refcnt);
	vfs_insmntque(vp, dead_rootmount);

	/* Remove from vnode cache. */
	hash = vcache_hash(&node->vn_key);
	mutex_enter(&vcache.lock);
	KASSERT(node == vcache_hash_lookup(&node->vn_key, hash));
	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	if (temp_key != temp_buf)
		kmem_free(temp_key, temp_key_len);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_vflag |= VV_LOCKSWORK;
	VSTATE_CHANGE(vp, VN_RECLAIMING, VN_RECLAIMED);
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Remove a vnode / fs node pair from the cache.
 */
void
vcache_remove(struct mount *mp, const void *key, size_t key_len)
{
#ifdef DIAGNOSTIC
	uint32_t hash;
	struct vcache_key vcache_key;
	struct vcache_node *node;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	KASSERT(node != NULL);
	mutex_exit(&vcache.lock);
#endif
}

/*
 * Print a vcache node.
 */
void
vcache_print(vnode_t *vp, const char *prefix, void (*pr)(const char *, ...))
{
	int n;
	const uint8_t *cp;
	struct vcache_node *node;

	node = VP_TO_VN(vp);
	n = node->vn_key.vk_key_len;
	cp = node->vn_key.vk_key;

	(*pr)("%sstate %s, key(%d)", prefix, vstate_name(node->vn_state), n);

	while (n-- > 0)
		(*pr)(" %02x", *cp++);
	(*pr)("\n");
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Test a vnode for being or becoming dead.  Returns one of:
 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 * ENOENT: vnode is dead.
 * 0:      otherwise.
 *
 * Whenever this function returns a non-zero value all future
 * calls will also return a non-zero value.
 */
int
vdead_check(struct vnode *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));

	if (! ISSET(flags, VDEAD_NOWAIT))
		VSTATE_WAIT_STABLE(vp);

	if (VSTATE_GET(vp) == VN_RECLAIMING) {
		KASSERT(ISSET(flags, VDEAD_NOWAIT));
		return EBUSY;
	} else if (VSTATE_GET(vp) == VN_RECLAIMED) {
		return ENOENT;
	}

	return 0;
}
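
/*
 * Illustrative sketch (not part of the original file): a device special
 * file operation that must not touch a revoked vnode can use vdead_check()
 * under the interlock.  example_spec_op() is a hypothetical name.
 *
 *	static int
 *	example_spec_op(vnode_t *vp)
 *	{
 *		int error;
 *
 *		mutex_enter(vp->v_interlock);
 *		// Do not sleep here; EBUSY means the vnode is being revoked.
 *		error = vdead_check(vp, VDEAD_NOWAIT);
 *		mutex_exit(vp->v_interlock);
 *		if (error != 0)
 *			return ENXIO;
 *		// ... proceed with the device operation ...
 *		return 0;
 *	}
 */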

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	vcache_reinit();

	return 0;
}

static void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}