1 /*	$NetBSD: vfs_vnode.c,v 1.100 2017/09/22 06:05:20 joerg Exp $	*/
2
3 /*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of inactive vnode, via vcache_vget(9).
80 *
81 * Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82 * was another, traditional way. Currently, only the draining thread
83 * recycles the vnodes. This behaviour might be revisited.
84 *
85 * The life-cycle ends when the last reference is dropped, usually
86 * in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
87 * the file system that the vnode is inactive. Via this call, the file
88 * system indicates whether the vnode can be recycled (usually, it checks
89 * its own references, e.g. the link count, or whether the file was removed).
90 *
91 * Depending on indication, vnode can be put into a free list (cache),
92 * or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
93 * disassociate underlying file system from the vnode, and finally
94 * destroyed.
95 *
96 * Vnode state
97 *
98 * Vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating with the underlying file system
102 * and is not yet ready to use.
103 * - LOADED Vnode has associated with the underlying file system
104 * and is ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from the underlying file
109 * system and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * LOADED -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system or vcache_rekey*()
129 * drops a vnode used as placeholder.
130 *
131 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate
132 * and it is possible to wait for state change.
133 *
134 * State is protected with v_interlock with one exception:
135 * to change from LOADING both v_interlock and vcache_lock must be held,
136 * so it is possible to check "state == LOADING" reliably while holding
137 * only vcache_lock. See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 * Vnode is considered active if its reference count (vnode_t::v_usecount)
142 * is non-zero. It is maintained using the vref(9), vrele(9) and
143 * vput(9) routines. Common points holding references are e.g.
144 * file openings, the current working directory, mount points, etc.
145 *
146 * Note on v_usecount and its locking
147 *
148 * At nearly all points where it is known that v_usecount could be zero,
149 * the vnode_t::v_interlock will be held. To change v_usecount away
150 * from zero, the interlock must be held. To change from a non-zero
151 * value to zero, again the interlock must be held.
152 *
153 * Changing the usecount from a non-zero value to a non-zero value can
154 * safely be done using atomic operations, without the interlock held.
155 *
156 */
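/*
 * Illustrative sketch (not compiled): how a consumer typically drives a
 * vnode through the life-cycle described above.  The mount point "mp" and
 * the inode-number key are assumptions made for the example; vcache_get(9),
 * vn_lock(9), vrele(9) and vput(9) are the real entry points.
 *
 *	struct vnode *vp;
 *	ino_t ino;			// key identifying the fs node
 *	int error;
 *
 *	// LOADING -> LOADED: look up or create the vnode / fs node pair.
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error)
 *		return error;
 *
 *	// The vnode is returned referenced but unlocked.
 *	error = vn_lock(vp, LK_EXCLUSIVE);
 *	if (error) {			// e.g. the vnode died in the meantime
 *		vrele(vp);
 *		return error;
 *	}
 *
 *	// ... VOP_*() operations on the locked vnode ...
 *
 *	// Unlock and drop the reference; the last vrele() may trigger
 *	// VOP_INACTIVE() and, if the file is unused, vcache_reclaim().
 *	vput(vp);
 */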
157
158 #include <sys/cdefs.h>
159 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.100 2017/09/22 06:05:20 joerg Exp $");
160
161 #include <sys/param.h>
162 #include <sys/kernel.h>
163
164 #include <sys/atomic.h>
165 #include <sys/buf.h>
166 #include <sys/conf.h>
167 #include <sys/device.h>
168 #include <sys/hash.h>
169 #include <sys/kauth.h>
170 #include <sys/kmem.h>
171 #include <sys/kthread.h>
172 #include <sys/module.h>
173 #include <sys/mount.h>
174 #include <sys/namei.h>
175 #include <sys/syscallargs.h>
176 #include <sys/sysctl.h>
177 #include <sys/systm.h>
178 #include <sys/vnode_impl.h>
179 #include <sys/wapbl.h>
180 #include <sys/fstrans.h>
181
182 #include <uvm/uvm.h>
183 #include <uvm/uvm_readahead.h>
184
185 /* Flags to vrelel. */
186 #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
187 #define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */
188
189 u_int numvnodes __cacheline_aligned;
190
191 /*
192 * There are three lru lists: one holds vnodes waiting for async release,
193 * one is for vnodes which have no buffer/page references and
194 * one for those which do (i.e. v_holdcnt is non-zero).
195 */
196 static vnodelst_t lru_vrele_list __cacheline_aligned;
197 static vnodelst_t lru_free_list __cacheline_aligned;
198 static vnodelst_t lru_hold_list __cacheline_aligned;
199 static kmutex_t vdrain_lock __cacheline_aligned;
200 static kcondvar_t vdrain_cv __cacheline_aligned;
201 static int vdrain_gen;
202 static kcondvar_t vdrain_gen_cv;
203 static bool vdrain_retry;
204 static lwp_t * vdrain_lwp;
205 SLIST_HEAD(hashhead, vnode_impl);
206 static kmutex_t vcache_lock __cacheline_aligned;
207 static kcondvar_t vcache_cv __cacheline_aligned;
208 static u_int vcache_hashsize;
209 static u_long vcache_hashmask;
210 static struct hashhead *vcache_hashtab __cacheline_aligned;
211 static pool_cache_t vcache_pool;
212 static void lru_requeue(vnode_t *, vnodelst_t *);
213 static vnodelst_t * lru_which(vnode_t *);
214 static vnode_impl_t * vcache_alloc(void);
215 static void vcache_dealloc(vnode_impl_t *);
216 static void vcache_free(vnode_impl_t *);
217 static void vcache_init(void);
218 static void vcache_reinit(void);
219 static void vcache_reclaim(vnode_t *);
220 static void vrelel(vnode_t *, int);
221 static void vdrain_thread(void *);
222 static void vnpanic(vnode_t *, const char *, ...)
223 __printflike(2, 3);
224
225 /* Routines having to do with the management of the vnode table. */
226 extern struct mount *dead_rootmount;
227 extern int (**dead_vnodeop_p)(void *);
228 extern int (**spec_vnodeop_p)(void *);
229 extern struct vfsops dead_vfsops;
230
231 /* Vnode state operations and diagnostics. */
232
233 #if defined(DIAGNOSTIC)
234
235 #define VSTATE_VALID(state) \
236 ((state) != VS_ACTIVE && (state) != VS_MARKER)
237 #define VSTATE_GET(vp) \
238 vstate_assert_get((vp), __func__, __LINE__)
239 #define VSTATE_CHANGE(vp, from, to) \
240 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
241 #define VSTATE_WAIT_STABLE(vp) \
242 vstate_assert_wait_stable((vp), __func__, __LINE__)
243
244 void
245 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
246 bool has_lock)
247 {
248 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
249
250 if (!has_lock) {
251 /*
252 * Prevent predictive loads from the CPU, but check the state
253 * without taking the interlock first.
254 */
255 membar_enter();
256 if (state == VS_ACTIVE && vp->v_usecount > 0 &&
257 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED))
258 return;
259 if (vip->vi_state == state)
260 return;
261 mutex_enter((vp)->v_interlock);
262 }
263
264 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
265
266 if ((state == VS_ACTIVE && vp->v_usecount > 0 &&
267 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
268 vip->vi_state == state) {
269 if (!has_lock)
270 mutex_exit((vp)->v_interlock);
271 return;
272 }
273 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
274 vstate_name(vip->vi_state), vp->v_usecount,
275 vstate_name(state), func, line);
276 }
277
278 static enum vnode_state
279 vstate_assert_get(vnode_t *vp, const char *func, int line)
280 {
281 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
282
283 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
284 if (! VSTATE_VALID(vip->vi_state))
285 vnpanic(vp, "state is %s at %s:%d",
286 vstate_name(vip->vi_state), func, line);
287
288 return vip->vi_state;
289 }
290
291 static void
292 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
293 {
294 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
295
296 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
297 if (! VSTATE_VALID(vip->vi_state))
298 vnpanic(vp, "state is %s at %s:%d",
299 vstate_name(vip->vi_state), func, line);
300
301 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
302 cv_wait(&vp->v_cv, vp->v_interlock);
303
304 if (! VSTATE_VALID(vip->vi_state))
305 vnpanic(vp, "state is %s at %s:%d",
306 vstate_name(vip->vi_state), func, line);
307 }
308
309 static void
310 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
311 const char *func, int line)
312 {
313 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
314
315 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
316 if (from == VS_LOADING)
317 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
318
319 if (! VSTATE_VALID(from))
320 vnpanic(vp, "from is %s at %s:%d",
321 vstate_name(from), func, line);
322 if (! VSTATE_VALID(to))
323 vnpanic(vp, "to is %s at %s:%d",
324 vstate_name(to), func, line);
325 if (vip->vi_state != from)
326 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
327 vstate_name(vip->vi_state), vstate_name(from), func, line);
328 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1)
329 vnpanic(vp, "%s to %s with usecount %d at %s:%d",
330 vstate_name(from), vstate_name(to), vp->v_usecount,
331 func, line);
332
333 vip->vi_state = to;
334 if (from == VS_LOADING)
335 cv_broadcast(&vcache_cv);
336 if (to == VS_LOADED || to == VS_RECLAIMED)
337 cv_broadcast(&vp->v_cv);
338 }
339
340 #else /* defined(DIAGNOSTIC) */
341
342 #define VSTATE_GET(vp) \
343 (VNODE_TO_VIMPL((vp))->vi_state)
344 #define VSTATE_CHANGE(vp, from, to) \
345 vstate_change((vp), (from), (to))
346 #define VSTATE_WAIT_STABLE(vp) \
347 vstate_wait_stable((vp))
348 void
349 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
350 bool has_lock)
351 {
352
353 }
354
355 static void
356 vstate_wait_stable(vnode_t *vp)
357 {
358 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
359
360 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
361 cv_wait(&vp->v_cv, vp->v_interlock);
362 }
363
364 static void
365 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
366 {
367 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
368
369 vip->vi_state = to;
370 if (from == VS_LOADING)
371 cv_broadcast(&vcache_cv);
372 if (to == VS_LOADED || to == VS_RECLAIMED)
373 cv_broadcast(&vp->v_cv);
374 }
375
376 #endif /* defined(DIAGNOSTIC) */
377
378 void
379 vfs_vnode_sysinit(void)
380 {
381 int error __diagused;
382
383 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
384 KASSERT(dead_rootmount != NULL);
385 dead_rootmount->mnt_iflag = IMNT_MPSAFE;
386
387 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
388 TAILQ_INIT(&lru_free_list);
389 TAILQ_INIT(&lru_hold_list);
390 TAILQ_INIT(&lru_vrele_list);
391
392 vcache_init();
393
394 cv_init(&vdrain_cv, "vdrain");
395 cv_init(&vdrain_gen_cv, "vdrainwt");
396 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
397 NULL, &vdrain_lwp, "vdrain");
398 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
399 }
400
401 /*
402 * Allocate a new marker vnode.
403 */
404 vnode_t *
405 vnalloc_marker(struct mount *mp)
406 {
407 vnode_impl_t *vip;
408 vnode_t *vp;
409
410 vip = pool_cache_get(vcache_pool, PR_WAITOK);
411 memset(vip, 0, sizeof(*vip));
412 vp = VIMPL_TO_VNODE(vip);
413 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
414 vp->v_mount = mp;
415 vp->v_type = VBAD;
416 vip->vi_state = VS_MARKER;
417
418 return vp;
419 }
420
421 /*
422 * Free a marker vnode.
423 */
424 void
425 vnfree_marker(vnode_t *vp)
426 {
427 vnode_impl_t *vip;
428
429 vip = VNODE_TO_VIMPL(vp);
430 KASSERT(vip->vi_state == VS_MARKER);
431 uvm_obj_destroy(&vp->v_uobj, true);
432 pool_cache_put(vcache_pool, vip);
433 }
434
435 /*
436 * Test a vnode for being a marker vnode.
437 */
438 bool
439 vnis_marker(vnode_t *vp)
440 {
441
442 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
443 }
444
445 /*
446 * Return the lru list this node should be on.
447 */
448 static vnodelst_t *
449 lru_which(vnode_t *vp)
450 {
451
452 KASSERT(mutex_owned(vp->v_interlock));
453
454 if (vp->v_holdcnt > 0)
455 return &lru_hold_list;
456 else
457 return &lru_free_list;
458 }
459
460 /*
461 * Put vnode to end of given list.
462 * Both the current and the new list may be NULL, used on vnode alloc/free.
463 * Adjust numvnodes and signal vdrain thread if there is work.
464 */
465 static void
466 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
467 {
468 vnode_impl_t *vip;
469
470 mutex_enter(&vdrain_lock);
471 vip = VNODE_TO_VIMPL(vp);
472 if (vip->vi_lrulisthd != NULL)
473 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
474 else
475 numvnodes++;
476 vip->vi_lrulisthd = listhd;
477 if (vip->vi_lrulisthd != NULL)
478 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
479 else
480 numvnodes--;
481 if (numvnodes > desiredvnodes || listhd == &lru_vrele_list)
482 cv_broadcast(&vdrain_cv);
483 mutex_exit(&vdrain_lock);
484 }
485
486 /*
487 * Release deferred vrele vnodes for this mount.
488 * Called with file system suspended.
489 */
490 void
491 vrele_flush(struct mount *mp)
492 {
493 vnode_impl_t *vip, *marker;
494
495 KASSERT(fstrans_is_owner(mp));
496
497 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
498
499 mutex_enter(&vdrain_lock);
500 TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist);
501
502 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
503 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
504 TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist);
505 if (vnis_marker(VIMPL_TO_VNODE(vip)))
506 continue;
507
508 KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
509 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
510 vip->vi_lrulisthd = &lru_hold_list;
511 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
512 mutex_exit(&vdrain_lock);
513
514 mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock);
515 vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE);
516
517 mutex_enter(&vdrain_lock);
518 }
519
520 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
521 mutex_exit(&vdrain_lock);
522
523 vnfree_marker(VIMPL_TO_VNODE(marker));
524 }
525
526 /*
527 * Reclaim a cached vnode. Used from vdrain_thread only.
528 */
529 static __inline void
530 vdrain_remove(vnode_t *vp)
531 {
532 struct mount *mp;
533
534 KASSERT(mutex_owned(&vdrain_lock));
535
536 /* Probe usecount (unlocked). */
537 if (vp->v_usecount > 0)
538 return;
539 /* Try v_interlock -- we lock the wrong direction! */
540 if (!mutex_tryenter(vp->v_interlock))
541 return;
542 /* Probe usecount and state. */
543 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_LOADED) {
544 mutex_exit(vp->v_interlock);
545 return;
546 }
547 mp = vp->v_mount;
548 if (fstrans_start_nowait(mp) != 0) {
549 mutex_exit(vp->v_interlock);
550 return;
551 }
552 vdrain_retry = true;
553 mutex_exit(&vdrain_lock);
554
555 if (vcache_vget(vp) == 0) {
556 if (!vrecycle(vp)) {
557 mutex_enter(vp->v_interlock);
558 vrelel(vp, VRELEL_FORCE_RELE);
559 }
560 }
561 fstrans_done(mp);
562
563 mutex_enter(&vdrain_lock);
564 }
565
566 /*
567 * Release a cached vnode. Used from vdrain_thread only.
568 */
569 static __inline void
570 vdrain_vrele(vnode_t *vp)
571 {
572 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
573 struct mount *mp;
574
575 KASSERT(mutex_owned(&vdrain_lock));
576
577 mp = vp->v_mount;
578 if (fstrans_start_nowait(mp) != 0)
579 return;
580
581 /*
582 * First remove the vnode from the vrele list.
583 * Put it onto the hold lru list; the last vrele()
584 * will put it back onto the right list before
585 * its v_usecount reaches zero.
586 */
587 KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
588 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
589 vip->vi_lrulisthd = &lru_hold_list;
590 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
591
592 vdrain_retry = true;
593 mutex_exit(&vdrain_lock);
594
595 mutex_enter(vp->v_interlock);
596 vrelel(vp, VRELEL_FORCE_RELE);
597 fstrans_done(mp);
598
599 mutex_enter(&vdrain_lock);
600 }
601
602 /*
603 * Helper thread to keep the number of vnodes below desiredvnodes
604 * and release vnodes from asynchronous vrele.
605 */
606 static void
607 vdrain_thread(void *cookie)
608 {
609 vnodelst_t *listhd[] = {
610 &lru_vrele_list, &lru_free_list, &lru_hold_list
611 };
612 int i;
613 u_int target;
614 vnode_impl_t *vip, *marker;
615
616 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
617
618 mutex_enter(&vdrain_lock);
619
620 for (;;) {
621 vdrain_retry = false;
622 target = desiredvnodes - desiredvnodes/10;
623
624 for (i = 0; i < __arraycount(listhd); i++) {
625 TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist);
626 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
627 TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
628 TAILQ_INSERT_AFTER(listhd[i], vip, marker,
629 vi_lrulist);
630 if (vnis_marker(VIMPL_TO_VNODE(vip)))
631 continue;
632 if (listhd[i] == &lru_vrele_list)
633 vdrain_vrele(VIMPL_TO_VNODE(vip));
634 else if (numvnodes < target)
635 break;
636 else
637 vdrain_remove(VIMPL_TO_VNODE(vip));
638 }
639 TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
640 }
641
642 if (vdrain_retry) {
643 mutex_exit(&vdrain_lock);
644 yield();
645 mutex_enter(&vdrain_lock);
646 } else {
647 vdrain_gen++;
648 cv_broadcast(&vdrain_gen_cv);
649 cv_wait(&vdrain_cv, &vdrain_lock);
650 }
651 }
652 }
653
654 /*
655 * vput: unlock and release the reference.
656 */
657 void
658 vput(vnode_t *vp)
659 {
660
661 VOP_UNLOCK(vp);
662 vrele(vp);
663 }
664
665 /*
666 * Try to drop reference on a vnode. Abort if we are releasing the
667 * last reference. Note: this _must_ succeed if not the last reference.
668 */
669 static inline bool
670 vtryrele(vnode_t *vp)
671 {
672 u_int use, next;
673
674 for (use = vp->v_usecount;; use = next) {
675 if (use == 1) {
676 return false;
677 }
678 KASSERT(use > 1);
679 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
680 if (__predict_true(next == use)) {
681 return true;
682 }
683 }
684 }
685
686 /*
687 * Vnode release. If reference count drops to zero, call inactive
688 * routine and either return to freelist or free to the pool.
689 */
690 static void
691 vrelel(vnode_t *vp, int flags)
692 {
693 const bool async = ((flags & VRELEL_ASYNC_RELE) != 0);
694 const bool force = ((flags & VRELEL_FORCE_RELE) != 0);
695 bool recycle, defer;
696 int error;
697
698 KASSERT(mutex_owned(vp->v_interlock));
699
700 if (__predict_false(vp->v_op == dead_vnodeop_p &&
701 VSTATE_GET(vp) != VS_RECLAIMED)) {
702 vnpanic(vp, "dead but not clean");
703 }
704
705 /*
706 * If not the last reference, just drop the reference count
707 * and unlock.
708 */
709 if (vtryrele(vp)) {
710 mutex_exit(vp->v_interlock);
711 return;
712 }
713 if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
714 vnpanic(vp, "%s: bad ref count", __func__);
715 }
716
717 #ifdef DIAGNOSTIC
718 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
719 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
720 vprint("vrelel: missing VOP_CLOSE()", vp);
721 }
722 #endif
723
724 /*
725 * First try to get the vnode locked for VOP_INACTIVE().
726 * Defer vnode release to the vdrain thread if the caller requests
727 * it explicitly, is the pagedaemon, or if the lock attempt failed.
728 */
729 if ((curlwp == uvm.pagedaemon_lwp) || async) {
730 defer = true;
731 } else {
732 mutex_exit(vp->v_interlock);
733 error = vn_lock(vp,
734 LK_EXCLUSIVE | LK_RETRY | (force ? 0 : LK_NOWAIT));
735 defer = (error != 0);
736 mutex_enter(vp->v_interlock);
737 }
738 KASSERT(mutex_owned(vp->v_interlock));
739 KASSERT(! (force && defer));
740 if (defer) {
741 /*
742 * Defer reclaim to the kthread; it's not safe to
743 * clean it here. We donate it our last reference.
744 */
745 lru_requeue(vp, &lru_vrele_list);
746 mutex_exit(vp->v_interlock);
747 return;
748 }
749
750 /*
751 * If the node got another reference while we
752 * released the interlock, don't try to inactivate it yet.
753 */
754 if (__predict_false(vtryrele(vp))) {
755 VOP_UNLOCK(vp);
756 mutex_exit(vp->v_interlock);
757 return;
758 }
759
760 /*
761 * If not clean, deactivate the vnode, but preserve
762 * our reference across the call to VOP_INACTIVE().
763 */
764 if (VSTATE_GET(vp) == VS_RECLAIMED) {
765 VOP_UNLOCK(vp);
766 } else {
767 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
768 mutex_exit(vp->v_interlock);
769
770 /*
771 * The vnode must not gain another reference while being
772 * deactivated. If VOP_INACTIVE() indicates that
773 * the described file has been deleted, then recycle
774 * the vnode.
775 *
776 * Note that VOP_INACTIVE() will not drop the vnode lock.
777 */
778 recycle = false;
779 VOP_INACTIVE(vp, &recycle);
780 if (!recycle)
781 VOP_UNLOCK(vp);
782 mutex_enter(vp->v_interlock);
783 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
784 if (!recycle) {
785 if (vtryrele(vp)) {
786 mutex_exit(vp->v_interlock);
787 return;
788 }
789 }
790
791 /* Take care of space accounting. */
792 if (vp->v_iflag & VI_EXECMAP) {
793 atomic_add_int(&uvmexp.execpages,
794 -vp->v_uobj.uo_npages);
795 atomic_add_int(&uvmexp.filepages,
796 vp->v_uobj.uo_npages);
797 }
798 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
799 vp->v_vflag &= ~VV_MAPPED;
800
801 /*
802 * Recycle the vnode if the file is now unused (unlinked),
803 * otherwise just free it.
804 */
805 if (recycle) {
806 VSTATE_ASSERT(vp, VS_LOADED);
807 /* vcache_reclaim drops the lock. */
808 vcache_reclaim(vp);
809 }
810 KASSERT(vp->v_usecount > 0);
811 }
812
813 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
814 /* Gained another reference while being reclaimed. */
815 mutex_exit(vp->v_interlock);
816 return;
817 }
818
819 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
820 /*
821 * It's clean so destroy it. It isn't referenced
822 * anywhere since it has been reclaimed.
823 */
824 vcache_free(VNODE_TO_VIMPL(vp));
825 } else {
826 /*
827 * Otherwise, put it back onto the freelist. It
828 * can't be destroyed while still associated with
829 * a file system.
830 */
831 lru_requeue(vp, lru_which(vp));
832 mutex_exit(vp->v_interlock);
833 }
834 }
835
836 void
837 vrele(vnode_t *vp)
838 {
839
840 if (vtryrele(vp)) {
841 return;
842 }
843 mutex_enter(vp->v_interlock);
844 vrelel(vp, 0);
845 }
846
847 /*
848 * Asynchronous vnode release: the vnode is released in a different context.
849 */
850 void
851 vrele_async(vnode_t *vp)
852 {
853
854 if (vtryrele(vp)) {
855 return;
856 }
857 mutex_enter(vp->v_interlock);
858 vrelel(vp, VRELEL_ASYNC_RELE);
859 }
860
861 /*
862 * Vnode reference, where a reference is already held by some other
863 * object (for example, a file structure).
864 */
865 void
866 vref(vnode_t *vp)
867 {
868
869 KASSERT(vp->v_usecount != 0);
870
871 atomic_inc_uint(&vp->v_usecount);
872 }
873
874 /*
875 * Page or buffer structure gets a reference.
876 * Called with v_interlock held.
877 */
878 void
879 vholdl(vnode_t *vp)
880 {
881
882 KASSERT(mutex_owned(vp->v_interlock));
883
884 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0)
885 lru_requeue(vp, lru_which(vp));
886 }
887
888 /*
889 * Page or buffer structure frees a reference.
890 * Called with v_interlock held.
891 */
892 void
893 holdrelel(vnode_t *vp)
894 {
895
896 KASSERT(mutex_owned(vp->v_interlock));
897
898 if (vp->v_holdcnt <= 0) {
899 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
900 }
901
902 vp->v_holdcnt--;
903 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
904 lru_requeue(vp, lru_which(vp));
905 }
906
907 /*
908 * Recycle an unused vnode if caller holds the last reference.
909 */
910 bool
911 vrecycle(vnode_t *vp)
912 {
913 int error __diagused;
914
915 mutex_enter(vp->v_interlock);
916
917 /* Make sure we hold the last reference. */
918 VSTATE_WAIT_STABLE(vp);
919 if (vp->v_usecount != 1) {
920 mutex_exit(vp->v_interlock);
921 return false;
922 }
923
924 /* If the vnode is already clean we're done. */
925 if (VSTATE_GET(vp) != VS_LOADED) {
926 VSTATE_ASSERT(vp, VS_RECLAIMED);
927 vrelel(vp, 0);
928 return true;
929 }
930
931 /* Prevent further references until the vnode is locked. */
932 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
933 mutex_exit(vp->v_interlock);
934
935 /*
936 * On a leaf file system this lock will always succeed as we hold
937 * the last reference and prevent further references.
938 * On layered file systems waiting for the lock would open a can of
939 * deadlocks as the lower vnodes may have other active references.
940 */
941 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
942
943 mutex_enter(vp->v_interlock);
944 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
945
946 if (error) {
947 mutex_exit(vp->v_interlock);
948 return false;
949 }
950
951 KASSERT(vp->v_usecount == 1);
952 vcache_reclaim(vp);
953 vrelel(vp, 0);
954
955 return true;
956 }
957
958 /*
959 * Helper for vrevoke() to propagate suspension from lastmp
960 * to thismp. Both args may be NULL.
961 * Returns the currently suspended file system or NULL.
962 */
963 static struct mount *
964 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
965 {
966 int error;
967
968 if (lastmp == thismp)
969 return thismp;
970
971 if (lastmp != NULL)
972 vfs_resume(lastmp);
973
974 if (thismp == NULL)
975 return NULL;
976
977 do {
978 error = vfs_suspend(thismp, 0);
979 } while (error == EINTR || error == ERESTART);
980
981 if (error == 0)
982 return thismp;
983
984 KASSERT(error == EOPNOTSUPP);
985 return NULL;
986 }
987
988 /*
989 * Eliminate all activity associated with the requested vnode
990 * and with all vnodes aliased to the requested vnode.
991 */
992 void
993 vrevoke(vnode_t *vp)
994 {
995 struct mount *mp;
996 vnode_t *vq;
997 enum vtype type;
998 dev_t dev;
999
1000 KASSERT(vp->v_usecount > 0);
1001
1002 mp = vrevoke_suspend_next(NULL, vp->v_mount);
1003
1004 mutex_enter(vp->v_interlock);
1005 VSTATE_WAIT_STABLE(vp);
1006 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1007 mutex_exit(vp->v_interlock);
1008 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1009 atomic_inc_uint(&vp->v_usecount);
1010 mutex_exit(vp->v_interlock);
1011 vgone(vp);
1012 } else {
1013 dev = vp->v_rdev;
1014 type = vp->v_type;
1015 mutex_exit(vp->v_interlock);
1016
1017 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1018 mp = vrevoke_suspend_next(mp, vq->v_mount);
1019 vgone(vq);
1020 }
1021 }
1022 vrevoke_suspend_next(mp, NULL);
1023 }
1024
1025 /*
1026 * Eliminate all activity associated with a vnode in preparation for
1027 * reuse. Drops a reference from the vnode.
1028 */
1029 void
1030 vgone(vnode_t *vp)
1031 {
1032
1033 KASSERT((vp->v_mount->mnt_iflag & IMNT_HAS_TRANS) == 0 ||
1034 fstrans_is_owner(vp->v_mount));
1035
1036 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1037 mutex_enter(vp->v_interlock);
1038 VSTATE_WAIT_STABLE(vp);
1039 if (VSTATE_GET(vp) == VS_LOADED)
1040 vcache_reclaim(vp);
1041 VSTATE_ASSERT(vp, VS_RECLAIMED);
1042 vrelel(vp, 0);
1043 }
1044
1045 static inline uint32_t
1046 vcache_hash(const struct vcache_key *key)
1047 {
1048 uint32_t hash = HASH32_BUF_INIT;
1049
1050 KASSERT(key->vk_key_len > 0);
1051
1052 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1053 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1054 return hash;
1055 }
1056
1057 static void
1058 vcache_init(void)
1059 {
1060
1061 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0,
1062 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1063 KASSERT(vcache_pool != NULL);
1064 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1065 cv_init(&vcache_cv, "vcache");
1066 vcache_hashsize = desiredvnodes;
1067 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1068 &vcache_hashmask);
1069 }
1070
1071 static void
1072 vcache_reinit(void)
1073 {
1074 int i;
1075 uint32_t hash;
1076 u_long oldmask, newmask;
1077 struct hashhead *oldtab, *newtab;
1078 vnode_impl_t *vip;
1079
1080 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1081 mutex_enter(&vcache_lock);
1082 oldtab = vcache_hashtab;
1083 oldmask = vcache_hashmask;
1084 vcache_hashsize = desiredvnodes;
1085 vcache_hashtab = newtab;
1086 vcache_hashmask = newmask;
1087 for (i = 0; i <= oldmask; i++) {
1088 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1089 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1090 hash = vcache_hash(&vip->vi_key);
1091 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1092 vip, vi_hash);
1093 }
1094 }
1095 mutex_exit(&vcache_lock);
1096 hashdone(oldtab, HASH_SLIST, oldmask);
1097 }
1098
1099 static inline vnode_impl_t *
1100 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1101 {
1102 struct hashhead *hashp;
1103 vnode_impl_t *vip;
1104
1105 KASSERT(mutex_owned(&vcache_lock));
1106
1107 hashp = &vcache_hashtab[hash & vcache_hashmask];
1108 SLIST_FOREACH(vip, hashp, vi_hash) {
1109 if (key->vk_mount != vip->vi_key.vk_mount)
1110 continue;
1111 if (key->vk_key_len != vip->vi_key.vk_key_len)
1112 continue;
1113 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1114 continue;
1115 return vip;
1116 }
1117 return NULL;
1118 }
1119
1120 /*
1121 * Allocate a new, uninitialized vcache node.
1122 */
1123 static vnode_impl_t *
1124 vcache_alloc(void)
1125 {
1126 vnode_impl_t *vip;
1127 vnode_t *vp;
1128
1129 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1130 memset(vip, 0, sizeof(*vip));
1131
1132 rw_init(&vip->vi_lock);
1133 /* SLIST_INIT(&vip->vi_hash); */
1134 /* LIST_INIT(&vip->vi_nclist); */
1135 /* LIST_INIT(&vip->vi_dnclist); */
1136
1137 vp = VIMPL_TO_VNODE(vip);
1138 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
1139 cv_init(&vp->v_cv, "vnode");
1140
1141 vp->v_usecount = 1;
1142 vp->v_type = VNON;
1143 vp->v_size = vp->v_writesize = VSIZENOTSET;
1144
1145 vip->vi_state = VS_LOADING;
1146
1147 lru_requeue(vp, &lru_free_list);
1148
1149 return vip;
1150 }
1151
1152 /*
1153 * Deallocate a vcache node in state VS_LOADING.
1154 *
1155 * vcache_lock held on entry and released on return.
1156 */
1157 static void
1158 vcache_dealloc(vnode_impl_t *vip)
1159 {
1160 vnode_t *vp;
1161
1162 KASSERT(mutex_owned(&vcache_lock));
1163
1164 vp = VIMPL_TO_VNODE(vip);
1165 mutex_enter(vp->v_interlock);
1166 vp->v_op = dead_vnodeop_p;
1167 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1168 mutex_exit(&vcache_lock);
1169 vrelel(vp, 0);
1170 }
1171
1172 /*
1173 * Free an unused, unreferenced vcache node.
1174 * v_interlock locked on entry.
1175 */
1176 static void
1177 vcache_free(vnode_impl_t *vip)
1178 {
1179 vnode_t *vp;
1180
1181 vp = VIMPL_TO_VNODE(vip);
1182 KASSERT(mutex_owned(vp->v_interlock));
1183
1184 KASSERT(vp->v_usecount == 0);
1185 KASSERT(vp->v_holdcnt == 0);
1186 KASSERT(vp->v_writecount == 0);
1187 lru_requeue(vp, NULL);
1188 mutex_exit(vp->v_interlock);
1189
1190 vfs_insmntque(vp, NULL);
1191 if (vp->v_type == VBLK || vp->v_type == VCHR)
1192 spec_node_destroy(vp);
1193
1194 rw_destroy(&vip->vi_lock);
1195 uvm_obj_destroy(&vp->v_uobj, true);
1196 cv_destroy(&vp->v_cv);
1197 pool_cache_put(vcache_pool, vip);
1198 }
1199
1200 /*
1201 * Try to get an initial reference on this cached vnode.
1202 * Returns zero on success, ENOENT if the vnode has been reclaimed and
1203 * EBUSY if the vnode state is unstable.
1204 *
1205 * v_interlock locked on entry and unlocked on exit.
1206 */
1207 int
1208 vcache_tryvget(vnode_t *vp)
1209 {
1210 int error = 0;
1211
1212 KASSERT(mutex_owned(vp->v_interlock));
1213
1214 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED))
1215 error = ENOENT;
1216 else if (__predict_false(VSTATE_GET(vp) != VS_LOADED))
1217 error = EBUSY;
1218 else if (vp->v_usecount == 0)
1219 vp->v_usecount = 1;
1220 else
1221 atomic_inc_uint(&vp->v_usecount);
1222
1223 mutex_exit(vp->v_interlock);
1224
1225 return error;
1226 }
1227
1228 /*
1229 * Try to get an initial reference on this cached vnode.
1230 * Returns zero on success and ENOENT if the vnode has been reclaimed.
1231 * Will wait for the vnode state to be stable.
1232 *
1233 * v_interlock locked on entry and unlocked on exit.
1234 */
1235 int
1236 vcache_vget(vnode_t *vp)
1237 {
1238
1239 KASSERT(mutex_owned(vp->v_interlock));
1240
1241 /* Increment hold count to prevent vnode from disappearing. */
1242 vp->v_holdcnt++;
1243 VSTATE_WAIT_STABLE(vp);
1244 vp->v_holdcnt--;
1245
1246 /* If this was the last reference to a reclaimed vnode free it now. */
1247 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1248 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
1249 vcache_free(VNODE_TO_VIMPL(vp));
1250 else
1251 mutex_exit(vp->v_interlock);
1252 return ENOENT;
1253 }
1254 VSTATE_ASSERT(vp, VS_LOADED);
1255 if (vp->v_usecount == 0)
1256 vp->v_usecount = 1;
1257 else
1258 atomic_inc_uint(&vp->v_usecount);
1259
1260 mutex_exit(vp->v_interlock);
1261
1262 return 0;
1263 }
1264
1265 /*
1266 * Get a vnode / fs node pair by key and return it referenced through vpp.
1267 */
1268 int
1269 vcache_get(struct mount *mp, const void *key, size_t key_len,
1270 struct vnode **vpp)
1271 {
1272 int error;
1273 uint32_t hash;
1274 const void *new_key;
1275 struct vnode *vp;
1276 struct vcache_key vcache_key;
1277 vnode_impl_t *vip, *new_vip;
1278
1279 new_key = NULL;
1280 *vpp = NULL;
1281
1282 vcache_key.vk_mount = mp;
1283 vcache_key.vk_key = key;
1284 vcache_key.vk_key_len = key_len;
1285 hash = vcache_hash(&vcache_key);
1286
1287 again:
1288 mutex_enter(&vcache_lock);
1289 vip = vcache_hash_lookup(&vcache_key, hash);
1290
1291 /* If found, take a reference or retry. */
1292 if (__predict_true(vip != NULL)) {
1293 /*
1294 * If the vnode is loading we cannot take the v_interlock
1295 * here as it might change during load (see uvm_obj_setlock()).
1296 * As changing state from VS_LOADING requires both vcache_lock
1297 * and v_interlock it is safe to test with vcache_lock held.
1298 *
1299 * Wait for vnodes changing state from VS_LOADING and retry.
1300 */
1301 if (__predict_false(vip->vi_state == VS_LOADING)) {
1302 cv_wait(&vcache_cv, &vcache_lock);
1303 mutex_exit(&vcache_lock);
1304 goto again;
1305 }
1306 vp = VIMPL_TO_VNODE(vip);
1307 mutex_enter(vp->v_interlock);
1308 mutex_exit(&vcache_lock);
1309 error = vcache_vget(vp);
1310 if (error == ENOENT)
1311 goto again;
1312 if (error == 0)
1313 *vpp = vp;
1314 KASSERT((error != 0) == (*vpp == NULL));
1315 return error;
1316 }
1317 mutex_exit(&vcache_lock);
1318
1319 /* Allocate and initialize a new vcache / vnode pair. */
1320 error = vfs_busy(mp);
1321 if (error)
1322 return error;
1323 new_vip = vcache_alloc();
1324 new_vip->vi_key = vcache_key;
1325 vp = VIMPL_TO_VNODE(new_vip);
1326 mutex_enter(&vcache_lock);
1327 vip = vcache_hash_lookup(&vcache_key, hash);
1328 if (vip == NULL) {
1329 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1330 new_vip, vi_hash);
1331 vip = new_vip;
1332 }
1333
1334 /* If another thread beat us inserting this node, retry. */
1335 if (vip != new_vip) {
1336 vcache_dealloc(new_vip);
1337 vfs_unbusy(mp);
1338 goto again;
1339 }
1340 mutex_exit(&vcache_lock);
1341
1342 /* Load the fs node. Exclusive as new_vip is VS_LOADING. */
1343 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1344 if (error) {
1345 mutex_enter(&vcache_lock);
1346 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1347 new_vip, vnode_impl, vi_hash);
1348 vcache_dealloc(new_vip);
1349 vfs_unbusy(mp);
1350 KASSERT(*vpp == NULL);
1351 return error;
1352 }
1353 KASSERT(new_key != NULL);
1354 KASSERT(memcmp(key, new_key, key_len) == 0);
1355 KASSERT(vp->v_op != NULL);
1356 vfs_insmntque(vp, mp);
1357 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1358 vp->v_vflag |= VV_MPSAFE;
1359 vfs_ref(mp);
1360 vfs_unbusy(mp);
1361
1362 /* Finished loading, finalize node. */
1363 mutex_enter(&vcache_lock);
1364 new_vip->vi_key.vk_key = new_key;
1365 mutex_enter(vp->v_interlock);
1366 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1367 mutex_exit(vp->v_interlock);
1368 mutex_exit(&vcache_lock);
1369 *vpp = vp;
1370 return 0;
1371 }
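/*
 * Sketch of the file system side of vcache_get(): a hypothetical
 * myfs_loadvnode() satisfying the VFS_LOADVNODE() contract used above.
 * The myfs_* names and the in-core inode layout are assumptions for
 * illustration only; the signature and the *new_key requirement (a
 * pointer into storage owned by the fs node, valid for the vnode's
 * lifetime) follow from the call above.
 *
 *	static int
 *	myfs_loadvnode(struct mount *mp, struct vnode *vp,
 *	    const void *key, size_t key_len, const void **new_key)
 *	{
 *		struct myfs_inode *ip;
 *		ino_t ino;
 *
 *		KASSERT(key_len == sizeof(ino));
 *		memcpy(&ino, key, sizeof(ino));
 *
 *		ip = myfs_inode_read(mp, ino);	// hypothetical helper
 *		if (ip == NULL)
 *			return ENOENT;
 *
 *		ip->i_vnode = vp;
 *		vp->v_type = ip->i_type;
 *		vp->v_op = myfs_vnodeop_p;	// hypothetical op vector
 *		vp->v_data = ip;
 *
 *		*new_key = &ip->i_number;	// stays valid with the fs node
 *		return 0;
 *	}
 */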
1372
1373 /*
1374 * Create a new vnode / fs node pair and return it referenced through vpp.
1375 */
1376 int
1377 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1378 kauth_cred_t cred, struct vnode **vpp)
1379 {
1380 int error;
1381 uint32_t hash;
1382 struct vnode *vp, *ovp;
1383 vnode_impl_t *vip, *ovip;
1384
1385 *vpp = NULL;
1386
1387 /* Allocate and initialize a new vcache / vnode pair. */
1388 error = vfs_busy(mp);
1389 if (error)
1390 return error;
1391 vip = vcache_alloc();
1392 vip->vi_key.vk_mount = mp;
1393 vp = VIMPL_TO_VNODE(vip);
1394
1395 /* Create and load the fs node. */
1396 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
1397 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1398 if (error) {
1399 mutex_enter(&vcache_lock);
1400 vcache_dealloc(vip);
1401 vfs_unbusy(mp);
1402 KASSERT(*vpp == NULL);
1403 return error;
1404 }
1405 KASSERT(vp->v_op != NULL);
1406 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1407 if (vip->vi_key.vk_key_len > 0) {
1408 KASSERT(vip->vi_key.vk_key != NULL);
1409 hash = vcache_hash(&vip->vi_key);
1410
1411 /*
1412 * Wait for previous instance to be reclaimed,
1413 * then insert new node.
1414 */
1415 mutex_enter(&vcache_lock);
1416 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1417 ovp = VIMPL_TO_VNODE(ovip);
1418 mutex_enter(ovp->v_interlock);
1419 mutex_exit(&vcache_lock);
1420 error = vcache_vget(ovp);
1421 KASSERT(error == ENOENT);
1422 mutex_enter(&vcache_lock);
1423 }
1424 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1425 vip, vi_hash);
1426 mutex_exit(&vcache_lock);
1427 }
1428 vfs_insmntque(vp, mp);
1429 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1430 vp->v_vflag |= VV_MPSAFE;
1431 vfs_ref(mp);
1432 vfs_unbusy(mp);
1433
1434 /* Finished loading, finalize node. */
1435 mutex_enter(&vcache_lock);
1436 mutex_enter(vp->v_interlock);
1437 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1438 mutex_exit(&vcache_lock);
1439 mutex_exit(vp->v_interlock);
1440 *vpp = vp;
1441 return 0;
1442 }
1443
1444 /*
1445 * Prepare key change: update the old cache node's key and lock the new cache node.
1446 * Return an error if the new node already exists.
1447 */
1448 int
1449 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1450 const void *old_key, size_t old_key_len,
1451 const void *new_key, size_t new_key_len)
1452 {
1453 uint32_t old_hash, new_hash;
1454 struct vcache_key old_vcache_key, new_vcache_key;
1455 vnode_impl_t *vip, *new_vip;
1456
1457 old_vcache_key.vk_mount = mp;
1458 old_vcache_key.vk_key = old_key;
1459 old_vcache_key.vk_key_len = old_key_len;
1460 old_hash = vcache_hash(&old_vcache_key);
1461
1462 new_vcache_key.vk_mount = mp;
1463 new_vcache_key.vk_key = new_key;
1464 new_vcache_key.vk_key_len = new_key_len;
1465 new_hash = vcache_hash(&new_vcache_key);
1466
1467 new_vip = vcache_alloc();
1468 new_vip->vi_key = new_vcache_key;
1469
1470 /* Insert locked new node used as placeholder. */
1471 mutex_enter(&vcache_lock);
1472 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1473 if (vip != NULL) {
1474 vcache_dealloc(new_vip);
1475 return EEXIST;
1476 }
1477 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1478 new_vip, vi_hash);
1479
1480 /* Replace the old node's key with the temporary copy. */
1481 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1482 KASSERT(vip != NULL);
1483 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1484 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1485 vip->vi_key = old_vcache_key;
1486 mutex_exit(&vcache_lock);
1487 return 0;
1488 }
1489
1490 /*
1491 * Key change complete: update old node and remove placeholder.
1492 */
1493 void
1494 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1495 const void *old_key, size_t old_key_len,
1496 const void *new_key, size_t new_key_len)
1497 {
1498 uint32_t old_hash, new_hash;
1499 struct vcache_key old_vcache_key, new_vcache_key;
1500 vnode_impl_t *vip, *new_vip;
1501 struct vnode *new_vp;
1502
1503 old_vcache_key.vk_mount = mp;
1504 old_vcache_key.vk_key = old_key;
1505 old_vcache_key.vk_key_len = old_key_len;
1506 old_hash = vcache_hash(&old_vcache_key);
1507
1508 new_vcache_key.vk_mount = mp;
1509 new_vcache_key.vk_key = new_key;
1510 new_vcache_key.vk_key_len = new_key_len;
1511 new_hash = vcache_hash(&new_vcache_key);
1512
1513 mutex_enter(&vcache_lock);
1514
1515 /* Lookup old and new node. */
1516 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1517 KASSERT(vip != NULL);
1518 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1519
1520 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1521 KASSERT(new_vip != NULL);
1522 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1523 new_vp = VIMPL_TO_VNODE(new_vip);
1524 mutex_enter(new_vp->v_interlock);
1525 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1526 mutex_exit(new_vp->v_interlock);
1527
1528 /* Rekey old node and put it onto its new hashlist. */
1529 vip->vi_key = new_vcache_key;
1530 if (old_hash != new_hash) {
1531 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1532 vip, vnode_impl, vi_hash);
1533 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1534 vip, vi_hash);
1535 }
1536
1537 /* Remove new node used as placeholder. */
1538 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1539 new_vip, vnode_impl, vi_hash);
1540 vcache_dealloc(new_vip);
1541 }
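/*
 * Sketch of the intended calling pattern for the two-phase rekey above,
 * e.g. from a rename-like operation in a file system whose vnode key
 * changes with the directory entry.  The myfs_* context is an assumption;
 * note that after vcache_rekey_exit() the cache keeps the new_key pointer,
 * so it must point into storage that stays valid for the vnode's lifetime
 * (typically inside the fs node), while old_key only has to stay valid
 * until the rekey is completed.
 *
 *	struct myfs_inode *ip = vp->v_data;	// hypothetical fs node
 *	struct myfs_key old_key = ip->i_key;	// keep the old key alive
 *
 *	ip->i_key = newkey;			// new key, computed by caller
 *	error = vcache_rekey_enter(mp, vp, &old_key, sizeof(old_key),
 *	    &ip->i_key, sizeof(ip->i_key));
 *	if (error)		// EEXIST: a node with the new key is cached
 *		return error;
 *
 *	// ... perform the on-disk part of the key change ...
 *
 *	vcache_rekey_exit(mp, vp, &old_key, sizeof(old_key),
 *	    &ip->i_key, sizeof(ip->i_key));
 */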
1542
1543 /*
1544 * Disassociate the underlying file system from a vnode.
1545 *
1546 * Must be called with vnode locked and will return unlocked.
1547 * Must be called with the interlock held, and will return with it held.
1548 */
1549 static void
1550 vcache_reclaim(vnode_t *vp)
1551 {
1552 lwp_t *l = curlwp;
1553 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1554 struct mount *mp = vp->v_mount;
1555 uint32_t hash;
1556 uint8_t temp_buf[64], *temp_key;
1557 size_t temp_key_len;
1558 bool recycle, active;
1559 int error;
1560
1561 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1562 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1563 KASSERT(mutex_owned(vp->v_interlock));
1564 KASSERT(vp->v_usecount != 0);
1565
1566 active = (vp->v_usecount > 1);
1567 temp_key_len = vip->vi_key.vk_key_len;
1568 /*
1569 * Prevent the vnode from being recycled or brought into use
1570 * while we clean it out.
1571 */
1572 VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING);
1573 if (vp->v_iflag & VI_EXECMAP) {
1574 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1575 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1576 }
1577 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1578 mutex_exit(vp->v_interlock);
1579
1580 /* Replace the vnode key with a temporary copy. */
1581 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1582 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1583 } else {
1584 temp_key = temp_buf;
1585 }
1586 if (vip->vi_key.vk_key_len > 0) {
1587 mutex_enter(&vcache_lock);
1588 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1589 vip->vi_key.vk_key = temp_key;
1590 mutex_exit(&vcache_lock);
1591 }
1592
1593 fstrans_start(mp);
1594
1595 /*
1596 * Clean out any cached data associated with the vnode.
1597 * If purging an active vnode, it must be closed and
1598 * deactivated before being reclaimed.
1599 */
1600 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1601 if (error != 0) {
1602 if (wapbl_vphaswapbl(vp))
1603 WAPBL_DISCARD(wapbl_vptomp(vp));
1604 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1605 }
1606 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1607 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1608 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1609 spec_node_revoke(vp);
1610 }
1611
1612 /*
1613 * Disassociate the underlying file system from the vnode.
1614 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1615 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1616 * would no longer function.
1617 */
1618 VOP_INACTIVE(vp, &recycle);
1619 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1620 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1621 if (VOP_RECLAIM(vp)) {
1622 vnpanic(vp, "%s: cannot reclaim", __func__);
1623 }
1624
1625 KASSERT(vp->v_data == NULL);
1626 KASSERT(vp->v_uobj.uo_npages == 0);
1627
1628 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1629 uvm_ra_freectx(vp->v_ractx);
1630 vp->v_ractx = NULL;
1631 }
1632
1633 /* Purge name cache. */
1634 cache_purge(vp);
1635
1636 if (vip->vi_key.vk_key_len > 0) {
1637 /* Remove from vnode cache. */
1638 hash = vcache_hash(&vip->vi_key);
1639 mutex_enter(&vcache_lock);
1640 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1641 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1642 vip, vnode_impl, vi_hash);
1643 mutex_exit(&vcache_lock);
1644 }
1645 if (temp_key != temp_buf)
1646 kmem_free(temp_key, temp_key_len);
1647
1648 /* Done with purge, notify sleepers of the grim news. */
1649 mutex_enter(vp->v_interlock);
1650 vp->v_op = dead_vnodeop_p;
1651 vp->v_vflag |= VV_LOCKSWORK;
1652 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1653 vp->v_tag = VT_NON;
1654 KNOTE(&vp->v_klist, NOTE_REVOKE);
1655 mutex_exit(vp->v_interlock);
1656
1657 /*
1658 * Move to dead mount. Must be after changing the operations
1659 * vector as vnode operations enter the mount before using the
1660 * operations vector. See sys/kern/vnode_if.c.
1661 */
1662 vp->v_vflag &= ~VV_ROOT;
1663 vfs_ref(dead_rootmount);
1664 vfs_insmntque(vp, dead_rootmount);
1665
1666 mutex_enter(vp->v_interlock);
1667 fstrans_done(mp);
1668 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1669 }
1670
1671 /*
1672 * Disassociate the underlying file system from an open device vnode
1673 * and make it anonymous.
1674 *
1675 * Vnode unlocked on entry, drops a reference to the vnode.
1676 */
1677 void
1678 vcache_make_anon(vnode_t *vp)
1679 {
1680 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1681 uint32_t hash;
1682 bool recycle;
1683
1684 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
1685 KASSERT((vp->v_mount->mnt_iflag & IMNT_HAS_TRANS) == 0 ||
1686 fstrans_is_owner(vp->v_mount));
1687 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
1688
1689 /* Remove from vnode cache. */
1690 hash = vcache_hash(&vip->vi_key);
1691 mutex_enter(&vcache_lock);
1692 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1693 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1694 vip, vnode_impl, vi_hash);
1695 vip->vi_key.vk_mount = dead_rootmount;
1696 vip->vi_key.vk_key_len = 0;
1697 vip->vi_key.vk_key = NULL;
1698 mutex_exit(&vcache_lock);
1699
1700 /*
1701 * Disassociate the underlying file system from the vnode.
1702 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1703 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1704 * would no longer function.
1705 */
1706 if (vn_lock(vp, LK_EXCLUSIVE)) {
1707 vnpanic(vp, "%s: cannot lock", __func__);
1708 }
1709 VOP_INACTIVE(vp, &recycle);
1710 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1711 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1712 if (VOP_RECLAIM(vp)) {
1713 vnpanic(vp, "%s: cannot reclaim", __func__);
1714 }
1715
1716 /* Purge name cache. */
1717 cache_purge(vp);
1718
1719 /* Done with purge, change operations vector. */
1720 mutex_enter(vp->v_interlock);
1721 vp->v_op = spec_vnodeop_p;
1722 vp->v_vflag |= VV_MPSAFE;
1723 vp->v_vflag &= ~VV_LOCKSWORK;
1724 mutex_exit(vp->v_interlock);
1725
1726 /*
1727 * Move to dead mount. Must be after changing the operations
1728 * vector as vnode operations enter the mount before using the
1729 * operations vector. See sys/kern/vnode_if.c.
1730 */
1731 vfs_ref(dead_rootmount);
1732 vfs_insmntque(vp, dead_rootmount);
1733
1734 vrele(vp);
1735 }
1736
1737 /*
1738 * Update outstanding I/O count and do wakeup if requested.
1739 */
1740 void
1741 vwakeup(struct buf *bp)
1742 {
1743 vnode_t *vp;
1744
1745 if ((vp = bp->b_vp) == NULL)
1746 return;
1747
1748 KASSERT(bp->b_objlock == vp->v_interlock);
1749 KASSERT(mutex_owned(bp->b_objlock));
1750
1751 if (--vp->v_numoutput < 0)
1752 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1753 if (vp->v_numoutput == 0)
1754 cv_broadcast(&vp->v_cv);
1755 }
1756
1757 /*
1758 * Test a vnode for being or becoming dead. Returns one of:
1759 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1760 * ENOENT: vnode is dead.
1761 * 0: otherwise.
1762 *
1763 * Whenever this function returns a non-zero value all future
1764 * calls will also return a non-zero value.
1765 */
1766 int
1767 vdead_check(struct vnode *vp, int flags)
1768 {
1769
1770 KASSERT(mutex_owned(vp->v_interlock));
1771
1772 if (! ISSET(flags, VDEAD_NOWAIT))
1773 VSTATE_WAIT_STABLE(vp);
1774
1775 if (VSTATE_GET(vp) == VS_RECLAIMING) {
1776 KASSERT(ISSET(flags, VDEAD_NOWAIT));
1777 return EBUSY;
1778 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
1779 return ENOENT;
1780 }
1781
1782 return 0;
1783 }
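/*
 * Typical use of vdead_check(), sketched for illustration (the calling
 * context is an assumption): a caller that cannot sleep probes with
 * VDEAD_NOWAIT and treats both EBUSY and ENOENT as "do not use this
 * vnode", while a caller that may sleep passes 0 and only has to
 * handle ENOENT.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		return ENOENT;	// dead or dying, treat as gone
 */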
1784
1785 int
1786 vfs_drainvnodes(void)
1787 {
1788 int i, gen;
1789
1790 mutex_enter(&vdrain_lock);
1791 for (i = 0; i < 2; i++) {
1792 gen = vdrain_gen;
1793 while (gen == vdrain_gen) {
1794 cv_broadcast(&vdrain_cv);
1795 cv_wait(&vdrain_gen_cv, &vdrain_lock);
1796 }
1797 }
1798 mutex_exit(&vdrain_lock);
1799
1800 if (numvnodes >= desiredvnodes)
1801 return EBUSY;
1802
1803 if (vcache_hashsize != desiredvnodes)
1804 vcache_reinit();
1805
1806 return 0;
1807 }
1808
1809 void
1810 vnpanic(vnode_t *vp, const char *fmt, ...)
1811 {
1812 va_list ap;
1813
1814 #ifdef DIAGNOSTIC
1815 vprint(NULL, vp);
1816 #endif
1817 va_start(ap, fmt);
1818 vpanic(fmt, ap);
1819 va_end(ap);
1820 }
1821