1 /* $NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of inactive vnode, via vcache_vget(9).
80 *
81 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
82 *	was another, traditional way.  Currently, only the draining thread
83 *	recycles vnodes.  This behaviour might be revisited.
84 *
85 *	The life-cycle ends when the last reference is dropped, usually
86 *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to inform
87 *	the file system that the vnode is inactive.  Via this call, the file
88 *	system indicates whether the vnode can be recycled (usually, it checks
89 *	its own references, e.g. the link count, whether the file was removed).
90 *
91 *	Depending on this indication, the vnode can be put onto a free list
92 *	(cache), or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
93 *	disassociate the underlying file system from the vnode, and finally
94 *	destroyed.
95 *
96 * Vnode state
97 *
98 * Vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating underlying file system and not
102 * yet ready to use.
103 * - LOADED Vnode has associated underlying file system and is
104 * ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from underlying file system
109 * and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> LOADED
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * LOADED -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * LOADED -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> LOADED
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system or vcache_rekey*()
129 * drops a vnode used as placeholder.
130 *
131 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate
132 * and it is possible to wait for state change.
133 *
134 *	State is protected with v_interlock with one exception:
135 *	to change from LOADING both v_interlock and vcache_lock must be held,
136 *	so it is possible to check "state == LOADING" while holding only
137 *	vcache_lock and not v_interlock.  See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 *	A vnode is considered active if its reference count
142 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using the
143 *	vref(9), vrele(9) and vput(9) routines.  Typical holders of references
144 *	are e.g. open files, current working directories and mount points.
145 *
146 * Note on v_usecount and its locking
147 *
148 *	At nearly all points where it is known that v_usecount could be
149 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
150 *	from zero, the interlock must be held.  To change from a non-zero
151 *	value to zero, again the interlock must be held.
152 *
153 *	Changing the usecount from one non-zero value to another can
154 *	safely be done using atomic operations, without the interlock held.
155 *	An illustrative usage sketch follows this comment.
156 */
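/*
 * What follows is an illustrative, non-compiled sketch (guarded by
 * "#if 0") of the reference counting rules described above.  The
 * example_* name is a placeholder and not part of this subsystem.
 */
#if 0
static void
example_ref_rules(struct vnode *vp)
{

	/* "vp" is assumed to carry a reference, e.g. from vcache_get(9). */
	vref(vp);			/* add a reference; vp must be active */
	vrele(vp);			/* drop it again; vp is not locked here */

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... VOP_*() calls ... */
	vput(vp);			/* VOP_UNLOCK() plus vrele() */

	/*
	 * Dropping the last reference eventually calls VOP_INACTIVE(9)
	 * and, if the file system asks for it, vcache_reclaim().
	 */
}
#endif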
157
158 #include <sys/cdefs.h>
159 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $");
160
161 #include <sys/param.h>
162 #include <sys/kernel.h>
163
164 #include <sys/atomic.h>
165 #include <sys/buf.h>
166 #include <sys/conf.h>
167 #include <sys/device.h>
168 #include <sys/hash.h>
169 #include <sys/kauth.h>
170 #include <sys/kmem.h>
171 #include <sys/kthread.h>
172 #include <sys/module.h>
173 #include <sys/mount.h>
174 #include <sys/namei.h>
175 #include <sys/syscallargs.h>
176 #include <sys/sysctl.h>
177 #include <sys/systm.h>
178 #include <sys/vnode_impl.h>
179 #include <sys/wapbl.h>
180 #include <sys/fstrans.h>
181
182 #include <uvm/uvm.h>
183 #include <uvm/uvm_readahead.h>
184
185 /* Flags to vrelel. */
186 #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
187 #define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */
188
189 u_int numvnodes __cacheline_aligned;
190
191 /*
192 * There are three lru lists: one holds vnodes waiting for async release,
193 * one is for vnodes which have no buffer/page references and
194 * one for those which do (i.e. v_holdcnt is non-zero).
195 */
196 static vnodelst_t lru_vrele_list __cacheline_aligned;
197 static vnodelst_t lru_free_list __cacheline_aligned;
198 static vnodelst_t lru_hold_list __cacheline_aligned;
199 static kmutex_t vdrain_lock __cacheline_aligned;
200 static kcondvar_t vdrain_cv __cacheline_aligned;
201 static int vdrain_gen;
202 static kcondvar_t vdrain_gen_cv;
203 static bool vdrain_retry;
204 static lwp_t * vdrain_lwp;
205 SLIST_HEAD(hashhead, vnode_impl);
206 static kmutex_t vcache_lock __cacheline_aligned;
207 static kcondvar_t vcache_cv __cacheline_aligned;
208 static u_int vcache_hashsize;
209 static u_long vcache_hashmask;
210 static struct hashhead *vcache_hashtab __cacheline_aligned;
211 static pool_cache_t vcache_pool;
212 static void lru_requeue(vnode_t *, vnodelst_t *);
213 static vnodelst_t * lru_which(vnode_t *);
214 static vnode_impl_t * vcache_alloc(void);
215 static void vcache_dealloc(vnode_impl_t *);
216 static void vcache_free(vnode_impl_t *);
217 static void vcache_init(void);
218 static void vcache_reinit(void);
219 static void vcache_reclaim(vnode_t *);
220 static void vrelel(vnode_t *, int);
221 static void vdrain_thread(void *);
222 static void vnpanic(vnode_t *, const char *, ...)
223 __printflike(2, 3);
224
225 /* Routines having to do with the management of the vnode table. */
226 extern struct mount *dead_rootmount;
227 extern int (**dead_vnodeop_p)(void *);
228 extern int (**spec_vnodeop_p)(void *);
229 extern struct vfsops dead_vfsops;
230
231 /* Vnode state operations and diagnostics. */
232
233 #if defined(DIAGNOSTIC)
234
235 #define VSTATE_VALID(state) \
236 ((state) != VS_ACTIVE && (state) != VS_MARKER)
237 #define VSTATE_GET(vp) \
238 vstate_assert_get((vp), __func__, __LINE__)
239 #define VSTATE_CHANGE(vp, from, to) \
240 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
241 #define VSTATE_WAIT_STABLE(vp) \
242 vstate_assert_wait_stable((vp), __func__, __LINE__)
243
244 void
245 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
246 bool has_lock)
247 {
248 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
249
250 if (!has_lock) {
251 /*
252 * Prevent predictive loads from the CPU, but check the state
253 * without locking first.
254 */
255 membar_enter();
256 if (state == VS_ACTIVE && vp->v_usecount > 0 &&
257 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED))
258 return;
259 if (vip->vi_state == state)
260 return;
261 mutex_enter((vp)->v_interlock);
262 }
263
264 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
265
266 if ((state == VS_ACTIVE && vp->v_usecount > 0 &&
267 (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
268 vip->vi_state == state) {
269 if (!has_lock)
270 mutex_exit((vp)->v_interlock);
271 return;
272 }
273 vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
274 vstate_name(vip->vi_state), vp->v_usecount,
275 vstate_name(state), func, line);
276 }
277
278 static enum vnode_state
279 vstate_assert_get(vnode_t *vp, const char *func, int line)
280 {
281 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
282
283 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
284 if (! VSTATE_VALID(vip->vi_state))
285 vnpanic(vp, "state is %s at %s:%d",
286 vstate_name(vip->vi_state), func, line);
287
288 return vip->vi_state;
289 }
290
291 static void
292 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
293 {
294 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
295
296 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
297 if (! VSTATE_VALID(vip->vi_state))
298 vnpanic(vp, "state is %s at %s:%d",
299 vstate_name(vip->vi_state), func, line);
300
301 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
302 cv_wait(&vp->v_cv, vp->v_interlock);
303
304 if (! VSTATE_VALID(vip->vi_state))
305 vnpanic(vp, "state is %s at %s:%d",
306 vstate_name(vip->vi_state), func, line);
307 }
308
309 static void
310 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
311 const char *func, int line)
312 {
313 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
314
315 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
316 if (from == VS_LOADING)
317 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
318
319 if (! VSTATE_VALID(from))
320 vnpanic(vp, "from is %s at %s:%d",
321 vstate_name(from), func, line);
322 if (! VSTATE_VALID(to))
323 vnpanic(vp, "to is %s at %s:%d",
324 vstate_name(to), func, line);
325 if (vip->vi_state != from)
326 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
327 vstate_name(vip->vi_state), vstate_name(from), func, line);
328 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1)
329 vnpanic(vp, "%s to %s with usecount %d at %s:%d",
330 vstate_name(from), vstate_name(to), vp->v_usecount,
331 func, line);
332
333 vip->vi_state = to;
334 if (from == VS_LOADING)
335 cv_broadcast(&vcache_cv);
336 if (to == VS_LOADED || to == VS_RECLAIMED)
337 cv_broadcast(&vp->v_cv);
338 }
339
340 #else /* defined(DIAGNOSTIC) */
341
342 #define VSTATE_GET(vp) \
343 (VNODE_TO_VIMPL((vp))->vi_state)
344 #define VSTATE_CHANGE(vp, from, to) \
345 vstate_change((vp), (from), (to))
346 #define VSTATE_WAIT_STABLE(vp) \
347 vstate_wait_stable((vp))
348 void
349 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
350 bool has_lock)
351 {
352
353 }
354
355 static void
356 vstate_wait_stable(vnode_t *vp)
357 {
358 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
359
360 while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
361 cv_wait(&vp->v_cv, vp->v_interlock);
362 }
363
364 static void
365 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
366 {
367 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
368
369 vip->vi_state = to;
370 if (from == VS_LOADING)
371 cv_broadcast(&vcache_cv);
372 if (to == VS_LOADED || to == VS_RECLAIMED)
373 cv_broadcast(&vp->v_cv);
374 }
375
376 #endif /* defined(DIAGNOSTIC) */
377
378 void
379 vfs_vnode_sysinit(void)
380 {
381 int error __diagused;
382
383 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
384 KASSERT(dead_rootmount != NULL);
385 dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
386
387 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
388 TAILQ_INIT(&lru_free_list);
389 TAILQ_INIT(&lru_hold_list);
390 TAILQ_INIT(&lru_vrele_list);
391
392 vcache_init();
393
394 cv_init(&vdrain_cv, "vdrain");
395 cv_init(&vdrain_gen_cv, "vdrainwt");
396 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
397 NULL, &vdrain_lwp, "vdrain");
398 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
399 }
400
401 /*
402 * Allocate a new marker vnode.
403 */
404 vnode_t *
405 vnalloc_marker(struct mount *mp)
406 {
407 vnode_impl_t *vip;
408 vnode_t *vp;
409
410 vip = pool_cache_get(vcache_pool, PR_WAITOK);
411 memset(vip, 0, sizeof(*vip));
412 vp = VIMPL_TO_VNODE(vip);
413 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
414 vp->v_mount = mp;
415 vp->v_type = VBAD;
416 vip->vi_state = VS_MARKER;
417
418 return vp;
419 }
420
421 /*
422 * Free a marker vnode.
423 */
424 void
425 vnfree_marker(vnode_t *vp)
426 {
427 vnode_impl_t *vip;
428
429 vip = VNODE_TO_VIMPL(vp);
430 KASSERT(vip->vi_state == VS_MARKER);
431 uvm_obj_destroy(&vp->v_uobj, true);
432 pool_cache_put(vcache_pool, vip);
433 }
434
435 /*
436 * Test a vnode for being a marker vnode.
437 */
438 bool
439 vnis_marker(vnode_t *vp)
440 {
441
442 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
443 }
444
445 /*
446 * Return the lru list this node should be on.
447 */
448 static vnodelst_t *
449 lru_which(vnode_t *vp)
450 {
451
452 KASSERT(mutex_owned(vp->v_interlock));
453
454 if (vp->v_holdcnt > 0)
455 return &lru_hold_list;
456 else
457 return &lru_free_list;
458 }
459
460 /*
461 * Put vnode to end of given list.
462 * Both the current and the new list may be NULL, used on vnode alloc/free.
463 * Adjust numvnodes and signal vdrain thread if there is work.
464 */
465 static void
466 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
467 {
468 vnode_impl_t *vip;
469
470 mutex_enter(&vdrain_lock);
471 vip = VNODE_TO_VIMPL(vp);
472 if (vip->vi_lrulisthd != NULL)
473 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
474 else
475 numvnodes++;
476 vip->vi_lrulisthd = listhd;
477 if (vip->vi_lrulisthd != NULL)
478 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
479 else
480 numvnodes--;
481 if (numvnodes > desiredvnodes || listhd == &lru_vrele_list)
482 cv_broadcast(&vdrain_cv);
483 mutex_exit(&vdrain_lock);
484 }
485
486 /*
487 * Release deferred vrele vnodes for this mount.
488 * Called with file system suspended.
489 */
490 void
491 vrele_flush(struct mount *mp)
492 {
493 vnode_impl_t *vip, *marker;
494
495 KASSERT(fstrans_is_owner(mp));
496
497 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
498
499 mutex_enter(&vdrain_lock);
500 TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist);
501
502 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
503 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
504 TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist);
505 if (vnis_marker(VIMPL_TO_VNODE(vip)))
506 continue;
507
508 KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
509 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
510 vip->vi_lrulisthd = &lru_hold_list;
511 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
512 mutex_exit(&vdrain_lock);
513
514 mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock);
515 vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE);
516
517 mutex_enter(&vdrain_lock);
518 }
519
520 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
521 mutex_exit(&vdrain_lock);
522
523 vnfree_marker(VIMPL_TO_VNODE(marker));
524 }
525
526 /*
527 * Reclaim a cached vnode. Used from vdrain_thread only.
528 */
529 static __inline void
530 vdrain_remove(vnode_t *vp)
531 {
532 struct mount *mp;
533
534 KASSERT(mutex_owned(&vdrain_lock));
535
536 /* Probe usecount (unlocked). */
537 if (vp->v_usecount > 0)
538 return;
539 /* Try v_interlock -- we lock the wrong direction! */
540 if (!mutex_tryenter(vp->v_interlock))
541 return;
542 /* Probe usecount and state. */
543 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_LOADED) {
544 mutex_exit(vp->v_interlock);
545 return;
546 }
547 mp = vp->v_mount;
548 if (fstrans_start_nowait(mp) != 0) {
549 mutex_exit(vp->v_interlock);
550 return;
551 }
552 vdrain_retry = true;
553 mutex_exit(&vdrain_lock);
554
555 if (vcache_vget(vp) == 0) {
556 if (!vrecycle(vp)) {
557 mutex_enter(vp->v_interlock);
558 vrelel(vp, VRELEL_FORCE_RELE);
559 }
560 }
561 fstrans_done(mp);
562
563 mutex_enter(&vdrain_lock);
564 }
565
566 /*
567 * Release a cached vnode. Used from vdrain_thread only.
568 */
569 static __inline void
570 vdrain_vrele(vnode_t *vp)
571 {
572 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
573 struct mount *mp;
574
575 KASSERT(mutex_owned(&vdrain_lock));
576
577 mp = vp->v_mount;
578 if (fstrans_start_nowait(mp) != 0)
579 return;
580
581 /*
582 * First remove the vnode from the vrele list.
583 * Put it on the last lru list, the last vrele()
584 * will put it back onto the right list before
585 * its v_usecount reaches zero.
586 */
587 KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
588 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
589 vip->vi_lrulisthd = &lru_hold_list;
590 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
591
592 vdrain_retry = true;
593 mutex_exit(&vdrain_lock);
594
595 mutex_enter(vp->v_interlock);
596 vrelel(vp, VRELEL_FORCE_RELE);
597 fstrans_done(mp);
598
599 mutex_enter(&vdrain_lock);
600 }
601
602 /*
603 * Helper thread to keep the number of vnodes below desiredvnodes
604 * and release vnodes from asynchronous vrele.
605 */
606 static void
607 vdrain_thread(void *cookie)
608 {
609 vnodelst_t *listhd[] = {
610 &lru_vrele_list, &lru_free_list, &lru_hold_list
611 };
612 int i;
613 u_int target;
614 vnode_impl_t *vip, *marker;
615
616 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
617
618 mutex_enter(&vdrain_lock);
619
620 for (;;) {
621 vdrain_retry = false;
622 target = desiredvnodes - desiredvnodes/10;
623
624 for (i = 0; i < __arraycount(listhd); i++) {
625 TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist);
626 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
627 TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
628 TAILQ_INSERT_AFTER(listhd[i], vip, marker,
629 vi_lrulist);
630 if (vnis_marker(VIMPL_TO_VNODE(vip)))
631 continue;
632 if (listhd[i] == &lru_vrele_list)
633 vdrain_vrele(VIMPL_TO_VNODE(vip));
634 else if (numvnodes < target)
635 break;
636 else
637 vdrain_remove(VIMPL_TO_VNODE(vip));
638 }
639 TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
640 }
641
642 if (vdrain_retry) {
643 mutex_exit(&vdrain_lock);
644 yield();
645 mutex_enter(&vdrain_lock);
646 } else {
647 vdrain_gen++;
648 cv_broadcast(&vdrain_gen_cv);
649 cv_wait(&vdrain_cv, &vdrain_lock);
650 }
651 }
652 }
653
654 /*
655 * vput: unlock and release the reference.
656 */
657 void
658 vput(vnode_t *vp)
659 {
660
661 VOP_UNLOCK(vp);
662 vrele(vp);
663 }
664
665 /*
666 * Try to drop reference on a vnode. Abort if we are releasing the
667 * last reference. Note: this _must_ succeed if not the last reference.
668 */
669 static inline bool
670 vtryrele(vnode_t *vp)
671 {
672 u_int use, next;
673
674 for (use = vp->v_usecount;; use = next) {
675 if (use == 1) {
676 return false;
677 }
678 KASSERT(use > 1);
679 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
680 if (__predict_true(next == use)) {
681 return true;
682 }
683 }
684 }
685
686 /*
687 * Vnode release. If reference count drops to zero, call inactive
688 * routine and either return to freelist or free to the pool.
689 */
690 static void
691 vrelel(vnode_t *vp, int flags)
692 {
693 const bool async = ((flags & VRELEL_ASYNC_RELE) != 0);
694 const bool force = ((flags & VRELEL_FORCE_RELE) != 0);
695 bool recycle, defer;
696 int error;
697
698 KASSERT(mutex_owned(vp->v_interlock));
699
700 if (__predict_false(vp->v_op == dead_vnodeop_p &&
701 VSTATE_GET(vp) != VS_RECLAIMED)) {
702 vnpanic(vp, "dead but not clean");
703 }
704
705 /*
706 * If not the last reference, just drop the reference count
707 * and unlock.
708 */
709 if (vtryrele(vp)) {
710 mutex_exit(vp->v_interlock);
711 return;
712 }
713 if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
714 vnpanic(vp, "%s: bad ref count", __func__);
715 }
716
717 #ifdef DIAGNOSTIC
718 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
719 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
720 vprint("vrelel: missing VOP_CLOSE()", vp);
721 }
722 #endif
723
724 /*
725 * First try to get the vnode locked for VOP_INACTIVE().
726 * Defer vnode release to vdrain_thread if the caller requests
727 * it explicitly, is the pagedaemon, or if taking the lock failed.
728 */
729 if ((curlwp == uvm.pagedaemon_lwp) || async) {
730 defer = true;
731 } else {
732 mutex_exit(vp->v_interlock);
733 error = vn_lock(vp,
734 LK_EXCLUSIVE | LK_RETRY | (force ? 0 : LK_NOWAIT));
735 defer = (error != 0);
736 mutex_enter(vp->v_interlock);
737 }
738 KASSERT(mutex_owned(vp->v_interlock));
739 KASSERT(! (force && defer));
740 if (defer) {
741 /*
742 * Defer reclaim to the kthread; it's not safe to
743 * clean it here. We donate it our last reference.
744 */
745 lru_requeue(vp, &lru_vrele_list);
746 mutex_exit(vp->v_interlock);
747 return;
748 }
749
750 /*
751 * If the node got another reference while we
752 * released the interlock, don't try to inactivate it yet.
753 */
754 if (__predict_false(vtryrele(vp))) {
755 VOP_UNLOCK(vp);
756 mutex_exit(vp->v_interlock);
757 return;
758 }
759
760 /*
761 * If not clean, deactivate the vnode, but preserve
762 * our reference across the call to VOP_INACTIVE().
763 */
764 if (VSTATE_GET(vp) == VS_RECLAIMED) {
765 VOP_UNLOCK(vp);
766 } else {
767 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
768 mutex_exit(vp->v_interlock);
769
770 /*
771 * The vnode must not gain another reference while being
772 * deactivated. If VOP_INACTIVE() indicates that
773 * the described file has been deleted, then recycle
774 * the vnode.
775 *
776 * Note that VOP_INACTIVE() will not drop the vnode lock.
777 */
778 recycle = false;
779 VOP_INACTIVE(vp, &recycle);
780 if (!recycle)
781 VOP_UNLOCK(vp);
782 mutex_enter(vp->v_interlock);
783 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
784 if (!recycle) {
785 if (vtryrele(vp)) {
786 mutex_exit(vp->v_interlock);
787 return;
788 }
789 }
790
791 /* Take care of space accounting. */
792 if (vp->v_iflag & VI_EXECMAP) {
793 atomic_add_int(&uvmexp.execpages,
794 -vp->v_uobj.uo_npages);
795 atomic_add_int(&uvmexp.filepages,
796 vp->v_uobj.uo_npages);
797 }
798 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
799 vp->v_vflag &= ~VV_MAPPED;
800
801 /*
802 * Recycle the vnode if the file is now unused (unlinked),
803 * otherwise just free it.
804 */
805 if (recycle) {
806 VSTATE_ASSERT(vp, VS_LOADED);
807 /* vcache_reclaim drops the lock. */
808 vcache_reclaim(vp);
809 }
810 KASSERT(vp->v_usecount > 0);
811 }
812
813 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
814 /* Gained another reference while being reclaimed. */
815 mutex_exit(vp->v_interlock);
816 return;
817 }
818
819 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
820 /*
821 * It's clean so destroy it. It isn't referenced
822 * anywhere since it has been reclaimed.
823 */
824 vcache_free(VNODE_TO_VIMPL(vp));
825 } else {
826 /*
827 * Otherwise, put it back onto the freelist. It
828 * can't be destroyed while still associated with
829 * a file system.
830 */
831 lru_requeue(vp, lru_which(vp));
832 mutex_exit(vp->v_interlock);
833 }
834 }
835
836 void
837 vrele(vnode_t *vp)
838 {
839
840 if (vtryrele(vp)) {
841 return;
842 }
843 mutex_enter(vp->v_interlock);
844 vrelel(vp, 0);
845 }
846
847 /*
848 * Asynchronous vnode release: the vnode is released in a different context.
849 */
850 void
851 vrele_async(vnode_t *vp)
852 {
853
854 if (vtryrele(vp)) {
855 return;
856 }
857 mutex_enter(vp->v_interlock);
858 vrelel(vp, VRELEL_ASYNC_RELE);
859 }
860
861 /*
862 * Vnode reference, where a reference is already held by some other
863 * object (for example, a file structure).
864 */
865 void
866 vref(vnode_t *vp)
867 {
868
869 KASSERT(vp->v_usecount != 0);
870
871 atomic_inc_uint(&vp->v_usecount);
872 }
873
874 /*
875 * Page or buffer structure gets a reference.
876 * Called with v_interlock held.
877 */
878 void
879 vholdl(vnode_t *vp)
880 {
881
882 KASSERT(mutex_owned(vp->v_interlock));
883
884 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0)
885 lru_requeue(vp, lru_which(vp));
886 }
887
888 /*
889 * Page or buffer structure frees a reference.
890 * Called with v_interlock held.
891 */
892 void
893 holdrelel(vnode_t *vp)
894 {
895
896 KASSERT(mutex_owned(vp->v_interlock));
897
898 if (vp->v_holdcnt <= 0) {
899 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
900 }
901
902 vp->v_holdcnt--;
903 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
904 lru_requeue(vp, lru_which(vp));
905 }
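/*
 * Illustrative sketch of the hold count protocol used by buffer and page
 * owners; both routines require v_interlock.  The example_* name is a
 * placeholder (non-compiled, "#if 0" guarded).
 */
#if 0
static void
example_hold_protocol(struct vnode *vp)
{

	mutex_enter(vp->v_interlock);
	vholdl(vp);			/* e.g. a buf is attached to vp */
	mutex_exit(vp->v_interlock);

	/* ... the hold reference keeps vp from being freed ... */

	mutex_enter(vp->v_interlock);
	holdrelel(vp);			/* the buf has been detached again */
	mutex_exit(vp->v_interlock);
}
#endif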
906
907 /*
908 * Recycle an unused vnode if caller holds the last reference.
909 */
910 bool
911 vrecycle(vnode_t *vp)
912 {
913 int error __diagused;
914
915 mutex_enter(vp->v_interlock);
916
917 /* Make sure we hold the last reference. */
918 VSTATE_WAIT_STABLE(vp);
919 if (vp->v_usecount != 1) {
920 mutex_exit(vp->v_interlock);
921 return false;
922 }
923
924 /* If the vnode is already clean we're done. */
925 if (VSTATE_GET(vp) != VS_LOADED) {
926 VSTATE_ASSERT(vp, VS_RECLAIMED);
927 vrelel(vp, 0);
928 return true;
929 }
930
931 /* Prevent further references until the vnode is locked. */
932 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
933 mutex_exit(vp->v_interlock);
934
935 /*
936 * On a leaf file system this lock will always succeed as we hold
937 * the last reference and prevent further references.
938 * On layered file systems waiting for the lock would open a can of
939 * deadlocks as the lower vnodes may have other active references.
940 */
941 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
942
943 mutex_enter(vp->v_interlock);
944 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
945
946 if (error) {
947 mutex_exit(vp->v_interlock);
948 return false;
949 }
950
951 KASSERT(vp->v_usecount == 1);
952 vcache_reclaim(vp);
953 vrelel(vp, 0);
954
955 return true;
956 }
957
958 /*
959 * Helper for vrevoke() to propagate suspension from lastmp
960 * to thismp. Both args may be NULL.
961 * Returns the currently suspended file system or NULL.
962 */
963 static struct mount *
964 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
965 {
966 int error;
967
968 if (lastmp == thismp)
969 return thismp;
970
971 if (lastmp != NULL)
972 vfs_resume(lastmp);
973
974 if (thismp == NULL)
975 return NULL;
976
977 do {
978 error = vfs_suspend(thismp, 0);
979 } while (error == EINTR || error == ERESTART);
980
981 if (error == 0)
982 return thismp;
983
984 KASSERT(error == EOPNOTSUPP);
985 return NULL;
986 }
987
988 /*
989 * Eliminate all activity associated with the requested vnode
990 * and with all vnodes aliased to the requested vnode.
991 */
992 void
993 vrevoke(vnode_t *vp)
994 {
995 struct mount *mp;
996 vnode_t *vq;
997 enum vtype type;
998 dev_t dev;
999
1000 KASSERT(vp->v_usecount > 0);
1001
1002 mp = vrevoke_suspend_next(NULL, vp->v_mount);
1003
1004 mutex_enter(vp->v_interlock);
1005 VSTATE_WAIT_STABLE(vp);
1006 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1007 mutex_exit(vp->v_interlock);
1008 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1009 atomic_inc_uint(&vp->v_usecount);
1010 mutex_exit(vp->v_interlock);
1011 vgone(vp);
1012 } else {
1013 dev = vp->v_rdev;
1014 type = vp->v_type;
1015 mutex_exit(vp->v_interlock);
1016
1017 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1018 mp = vrevoke_suspend_next(mp, vq->v_mount);
1019 vgone(vq);
1020 }
1021 }
1022 vrevoke_suspend_next(mp, NULL);
1023 }
1024
1025 /*
1026 * Eliminate all activity associated with a vnode in preparation for
1027 * reuse. Drops a reference from the vnode.
1028 */
1029 void
1030 vgone(vnode_t *vp)
1031 {
1032
1033 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1034
1035 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1036 mutex_enter(vp->v_interlock);
1037 VSTATE_WAIT_STABLE(vp);
1038 if (VSTATE_GET(vp) == VS_LOADED)
1039 vcache_reclaim(vp);
1040 VSTATE_ASSERT(vp, VS_RECLAIMED);
1041 vrelel(vp, 0);
1042 }
1043
1044 static inline uint32_t
1045 vcache_hash(const struct vcache_key *key)
1046 {
1047 uint32_t hash = HASH32_BUF_INIT;
1048
1049 KASSERT(key->vk_key_len > 0);
1050
1051 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1052 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1053 return hash;
1054 }
1055
1056 static void
1057 vcache_init(void)
1058 {
1059
1060 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0,
1061 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1062 KASSERT(vcache_pool != NULL);
1063 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1064 cv_init(&vcache_cv, "vcache");
1065 vcache_hashsize = desiredvnodes;
1066 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1067 &vcache_hashmask);
1068 }
1069
1070 static void
1071 vcache_reinit(void)
1072 {
1073 int i;
1074 uint32_t hash;
1075 u_long oldmask, newmask;
1076 struct hashhead *oldtab, *newtab;
1077 vnode_impl_t *vip;
1078
1079 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1080 mutex_enter(&vcache_lock);
1081 oldtab = vcache_hashtab;
1082 oldmask = vcache_hashmask;
1083 vcache_hashsize = desiredvnodes;
1084 vcache_hashtab = newtab;
1085 vcache_hashmask = newmask;
1086 for (i = 0; i <= oldmask; i++) {
1087 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1088 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1089 hash = vcache_hash(&vip->vi_key);
1090 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1091 vip, vi_hash);
1092 }
1093 }
1094 mutex_exit(&vcache_lock);
1095 hashdone(oldtab, HASH_SLIST, oldmask);
1096 }
1097
1098 static inline vnode_impl_t *
1099 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1100 {
1101 struct hashhead *hashp;
1102 vnode_impl_t *vip;
1103
1104 KASSERT(mutex_owned(&vcache_lock));
1105
1106 hashp = &vcache_hashtab[hash & vcache_hashmask];
1107 SLIST_FOREACH(vip, hashp, vi_hash) {
1108 if (key->vk_mount != vip->vi_key.vk_mount)
1109 continue;
1110 if (key->vk_key_len != vip->vi_key.vk_key_len)
1111 continue;
1112 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1113 continue;
1114 return vip;
1115 }
1116 return NULL;
1117 }
1118
1119 /*
1120 * Allocate a new, uninitialized vcache node.
1121 */
1122 static vnode_impl_t *
1123 vcache_alloc(void)
1124 {
1125 vnode_impl_t *vip;
1126 vnode_t *vp;
1127
1128 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1129 memset(vip, 0, sizeof(*vip));
1130
1131 rw_init(&vip->vi_lock);
1132 /* SLIST_INIT(&vip->vi_hash); */
1133 /* LIST_INIT(&vip->vi_nclist); */
1134 /* LIST_INIT(&vip->vi_dnclist); */
1135
1136 vp = VIMPL_TO_VNODE(vip);
1137 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
1138 cv_init(&vp->v_cv, "vnode");
1139
1140 vp->v_usecount = 1;
1141 vp->v_type = VNON;
1142 vp->v_size = vp->v_writesize = VSIZENOTSET;
1143
1144 vip->vi_state = VS_LOADING;
1145
1146 lru_requeue(vp, &lru_free_list);
1147
1148 return vip;
1149 }
1150
1151 /*
1152 * Deallocate a vcache node in state VS_LOADING.
1153 *
1154 * vcache_lock held on entry and released on return.
1155 */
1156 static void
1157 vcache_dealloc(vnode_impl_t *vip)
1158 {
1159 vnode_t *vp;
1160
1161 KASSERT(mutex_owned(&vcache_lock));
1162
1163 vp = VIMPL_TO_VNODE(vip);
1164 vfs_ref(dead_rootmount);
1165 vfs_insmntque(vp, dead_rootmount);
1166 mutex_enter(vp->v_interlock);
1167 vp->v_op = dead_vnodeop_p;
1168 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1169 mutex_exit(&vcache_lock);
1170 vrelel(vp, 0);
1171 }
1172
1173 /*
1174 * Free an unused, unreferenced vcache node.
1175 * v_interlock locked on entry.
1176 */
1177 static void
1178 vcache_free(vnode_impl_t *vip)
1179 {
1180 vnode_t *vp;
1181
1182 vp = VIMPL_TO_VNODE(vip);
1183 KASSERT(mutex_owned(vp->v_interlock));
1184
1185 KASSERT(vp->v_usecount == 0);
1186 KASSERT(vp->v_holdcnt == 0);
1187 KASSERT(vp->v_writecount == 0);
1188 lru_requeue(vp, NULL);
1189 mutex_exit(vp->v_interlock);
1190
1191 vfs_insmntque(vp, NULL);
1192 if (vp->v_type == VBLK || vp->v_type == VCHR)
1193 spec_node_destroy(vp);
1194
1195 rw_destroy(&vip->vi_lock);
1196 uvm_obj_destroy(&vp->v_uobj, true);
1197 cv_destroy(&vp->v_cv);
1198 pool_cache_put(vcache_pool, vip);
1199 }
1200
1201 /*
1202 * Try to get an initial reference on this cached vnode.
1203 * Returns zero on success, ENOENT if the vnode has been reclaimed and
1204 * EBUSY if the vnode state is unstable.
1205 *
1206 * v_interlock locked on entry and unlocked on exit.
1207 */
1208 int
1209 vcache_tryvget(vnode_t *vp)
1210 {
1211 int error = 0;
1212
1213 KASSERT(mutex_owned(vp->v_interlock));
1214
1215 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED))
1216 error = ENOENT;
1217 else if (__predict_false(VSTATE_GET(vp) != VS_LOADED))
1218 error = EBUSY;
1219 else if (vp->v_usecount == 0)
1220 vp->v_usecount = 1;
1221 else
1222 atomic_inc_uint(&vp->v_usecount);
1223
1224 mutex_exit(vp->v_interlock);
1225
1226 return error;
1227 }
1228
1229 /*
1230 * Try to get an initial reference on this cached vnode.
1231 * Returns zero on success and ENOENT if the vnode has been reclaimed.
1232 * Will wait for the vnode state to be stable.
1233 *
1234 * v_interlock locked on entry and unlocked on exit.
1235 */
1236 int
1237 vcache_vget(vnode_t *vp)
1238 {
1239
1240 KASSERT(mutex_owned(vp->v_interlock));
1241
1242 /* Increment hold count to prevent vnode from disappearing. */
1243 vp->v_holdcnt++;
1244 VSTATE_WAIT_STABLE(vp);
1245 vp->v_holdcnt--;
1246
1247 /* If this was the last reference to a reclaimed vnode free it now. */
1248 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1249 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
1250 vcache_free(VNODE_TO_VIMPL(vp));
1251 else
1252 mutex_exit(vp->v_interlock);
1253 return ENOENT;
1254 }
1255 VSTATE_ASSERT(vp, VS_LOADED);
1256 if (vp->v_usecount == 0)
1257 vp->v_usecount = 1;
1258 else
1259 atomic_inc_uint(&vp->v_usecount);
1260
1261 mutex_exit(vp->v_interlock);
1262
1263 return 0;
1264 }
1265
1266 /*
1267 * Get a vnode / fs node pair by key and return it referenced through vpp.
1268 */
1269 int
1270 vcache_get(struct mount *mp, const void *key, size_t key_len,
1271 struct vnode **vpp)
1272 {
1273 int error;
1274 uint32_t hash;
1275 const void *new_key;
1276 struct vnode *vp;
1277 struct vcache_key vcache_key;
1278 vnode_impl_t *vip, *new_vip;
1279
1280 new_key = NULL;
1281 *vpp = NULL;
1282
1283 vcache_key.vk_mount = mp;
1284 vcache_key.vk_key = key;
1285 vcache_key.vk_key_len = key_len;
1286 hash = vcache_hash(&vcache_key);
1287
1288 again:
1289 mutex_enter(&vcache_lock);
1290 vip = vcache_hash_lookup(&vcache_key, hash);
1291
1292 /* If found, take a reference or retry. */
1293 if (__predict_true(vip != NULL)) {
1294 /*
1295 * If the vnode is loading we cannot take the v_interlock
1296 * here as it might change during load (see uvm_obj_setlock()).
1297 * As changing state from VS_LOADING requires both vcache_lock
1298 * and v_interlock it is safe to test with vcache_lock held.
1299 *
1300 * Wait for vnodes changing state from VS_LOADING and retry.
1301 */
1302 if (__predict_false(vip->vi_state == VS_LOADING)) {
1303 cv_wait(&vcache_cv, &vcache_lock);
1304 mutex_exit(&vcache_lock);
1305 goto again;
1306 }
1307 vp = VIMPL_TO_VNODE(vip);
1308 mutex_enter(vp->v_interlock);
1309 mutex_exit(&vcache_lock);
1310 error = vcache_vget(vp);
1311 if (error == ENOENT)
1312 goto again;
1313 if (error == 0)
1314 *vpp = vp;
1315 KASSERT((error != 0) == (*vpp == NULL));
1316 return error;
1317 }
1318 mutex_exit(&vcache_lock);
1319
1320 /* Allocate and initialize a new vcache / vnode pair. */
1321 error = vfs_busy(mp);
1322 if (error)
1323 return error;
1324 new_vip = vcache_alloc();
1325 new_vip->vi_key = vcache_key;
1326 vp = VIMPL_TO_VNODE(new_vip);
1327 mutex_enter(&vcache_lock);
1328 vip = vcache_hash_lookup(&vcache_key, hash);
1329 if (vip == NULL) {
1330 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1331 new_vip, vi_hash);
1332 vip = new_vip;
1333 }
1334
1335 /* If another thread beat us inserting this node, retry. */
1336 if (vip != new_vip) {
1337 vcache_dealloc(new_vip);
1338 vfs_unbusy(mp);
1339 goto again;
1340 }
1341 mutex_exit(&vcache_lock);
1342
1343 /* Load the fs node.  Exclusive as new_vip is VS_LOADING. */
1344 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1345 if (error) {
1346 mutex_enter(&vcache_lock);
1347 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1348 new_vip, vnode_impl, vi_hash);
1349 vcache_dealloc(new_vip);
1350 vfs_unbusy(mp);
1351 KASSERT(*vpp == NULL);
1352 return error;
1353 }
1354 KASSERT(new_key != NULL);
1355 KASSERT(memcmp(key, new_key, key_len) == 0);
1356 KASSERT(vp->v_op != NULL);
1357 vfs_insmntque(vp, mp);
1358 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1359 vp->v_vflag |= VV_MPSAFE;
1360 vfs_ref(mp);
1361 vfs_unbusy(mp);
1362
1363 /* Finished loading, finalize node. */
1364 mutex_enter(&vcache_lock);
1365 new_vip->vi_key.vk_key = new_key;
1366 mutex_enter(vp->v_interlock);
1367 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1368 mutex_exit(vp->v_interlock);
1369 mutex_exit(&vcache_lock);
1370 *vpp = vp;
1371 return 0;
1372 }
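/*
 * Illustrative sketch: a hypothetical file system's VFS_VGET(9) style
 * lookup using vcache_get() with the inode number as the key.  The
 * examplefs_* name is a placeholder (non-compiled, "#if 0" guarded).
 */
#if 0
static int
examplefs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
{

	/*
	 * vcache_get() either finds a cached, loaded vnode or allocates
	 * a new one and calls VFS_LOADVNODE() with the same key.  The
	 * vnode is returned referenced but unlocked.
	 */
	return vcache_get(mp, &ino, sizeof(ino), vpp);
}
#endif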
1373
1374 /*
1375 * Create a new vnode / fs node pair and return it referenced through vpp.
1376 */
1377 int
1378 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1379 kauth_cred_t cred, void *extra, struct vnode **vpp)
1380 {
1381 int error;
1382 uint32_t hash;
1383 struct vnode *vp, *ovp;
1384 vnode_impl_t *vip, *ovip;
1385
1386 *vpp = NULL;
1387
1388 /* Allocate and initialize a new vcache / vnode pair. */
1389 error = vfs_busy(mp);
1390 if (error)
1391 return error;
1392 vip = vcache_alloc();
1393 vip->vi_key.vk_mount = mp;
1394 vp = VIMPL_TO_VNODE(vip);
1395
1396 /* Create and load the fs node. */
1397 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
1398 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1399 if (error) {
1400 mutex_enter(&vcache_lock);
1401 vcache_dealloc(vip);
1402 vfs_unbusy(mp);
1403 KASSERT(*vpp == NULL);
1404 return error;
1405 }
1406 KASSERT(vp->v_op != NULL);
1407 KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
1408 if (vip->vi_key.vk_key_len > 0) {
1409 KASSERT(vip->vi_key.vk_key != NULL);
1410 hash = vcache_hash(&vip->vi_key);
1411
1412 /*
1413 * Wait for previous instance to be reclaimed,
1414 * then insert new node.
1415 */
1416 mutex_enter(&vcache_lock);
1417 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1418 ovp = VIMPL_TO_VNODE(ovip);
1419 mutex_enter(ovp->v_interlock);
1420 mutex_exit(&vcache_lock);
1421 error = vcache_vget(ovp);
1422 KASSERT(error == ENOENT);
1423 mutex_enter(&vcache_lock);
1424 }
1425 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1426 vip, vi_hash);
1427 mutex_exit(&vcache_lock);
1428 }
1429 vfs_insmntque(vp, mp);
1430 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1431 vp->v_vflag |= VV_MPSAFE;
1432 vfs_ref(mp);
1433 vfs_unbusy(mp);
1434
1435 /* Finished loading, finalize node. */
1436 mutex_enter(&vcache_lock);
1437 mutex_enter(vp->v_interlock);
1438 VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
1439 mutex_exit(&vcache_lock);
1440 mutex_exit(vp->v_interlock);
1441 *vpp = vp;
1442 return 0;
1443 }
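/*
 * Illustrative sketch: a hypothetical file system creating a new vnode /
 * fs node pair from its create path.  Only vcache_new() and its documented
 * arguments are real; the examplefs_* name is a placeholder (non-compiled,
 * "#if 0" guarded).
 */
#if 0
static int
examplefs_makenode(struct vnode *dvp, struct vattr *vap, kauth_cred_t cred,
    struct vnode **vpp)
{
	int error;

	/*
	 * vcache_new() calls VFS_NEWVNODE() to allocate the fs node and
	 * obtain the key; the vnode comes back referenced but unlocked.
	 */
	error = vcache_new(dvp->v_mount, dvp, vap, cred, NULL, vpp);
	if (error != 0)
		return error;
	return vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
}
#endif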
1444
1445 /*
1446 * Prepare key change: update the old cache node's key and lock the new cache node.
1447 * Return an error if the new node already exists.
1448 */
1449 int
1450 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1451 const void *old_key, size_t old_key_len,
1452 const void *new_key, size_t new_key_len)
1453 {
1454 uint32_t old_hash, new_hash;
1455 struct vcache_key old_vcache_key, new_vcache_key;
1456 vnode_impl_t *vip, *new_vip;
1457
1458 old_vcache_key.vk_mount = mp;
1459 old_vcache_key.vk_key = old_key;
1460 old_vcache_key.vk_key_len = old_key_len;
1461 old_hash = vcache_hash(&old_vcache_key);
1462
1463 new_vcache_key.vk_mount = mp;
1464 new_vcache_key.vk_key = new_key;
1465 new_vcache_key.vk_key_len = new_key_len;
1466 new_hash = vcache_hash(&new_vcache_key);
1467
1468 new_vip = vcache_alloc();
1469 new_vip->vi_key = new_vcache_key;
1470
1471 /* Insert locked new node used as placeholder. */
1472 mutex_enter(&vcache_lock);
1473 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1474 if (vip != NULL) {
1475 vcache_dealloc(new_vip);
1476 return EEXIST;
1477 }
1478 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1479 new_vip, vi_hash);
1480
1481 /* Replace the old node's key with the temporary copy. */
1482 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1483 KASSERT(vip != NULL);
1484 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1485 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1486 vip->vi_key = old_vcache_key;
1487 mutex_exit(&vcache_lock);
1488 return 0;
1489 }
1490
1491 /*
1492 * Key change complete: update old node and remove placeholder.
1493 */
1494 void
1495 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1496 const void *old_key, size_t old_key_len,
1497 const void *new_key, size_t new_key_len)
1498 {
1499 uint32_t old_hash, new_hash;
1500 struct vcache_key old_vcache_key, new_vcache_key;
1501 vnode_impl_t *vip, *new_vip;
1502 struct vnode *new_vp;
1503
1504 old_vcache_key.vk_mount = mp;
1505 old_vcache_key.vk_key = old_key;
1506 old_vcache_key.vk_key_len = old_key_len;
1507 old_hash = vcache_hash(&old_vcache_key);
1508
1509 new_vcache_key.vk_mount = mp;
1510 new_vcache_key.vk_key = new_key;
1511 new_vcache_key.vk_key_len = new_key_len;
1512 new_hash = vcache_hash(&new_vcache_key);
1513
1514 mutex_enter(&vcache_lock);
1515
1516 /* Lookup old and new node. */
1517 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1518 KASSERT(vip != NULL);
1519 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1520
1521 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1522 KASSERT(new_vip != NULL);
1523 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1524 new_vp = VIMPL_TO_VNODE(new_vip);
1525 mutex_enter(new_vp->v_interlock);
1526 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1527 mutex_exit(new_vp->v_interlock);
1528
1529 /* Rekey old node and put it onto its new hashlist. */
1530 vip->vi_key = new_vcache_key;
1531 if (old_hash != new_hash) {
1532 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1533 vip, vnode_impl, vi_hash);
1534 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1535 vip, vi_hash);
1536 }
1537
1538 /* Remove new node used as placeholder. */
1539 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1540 new_vip, vnode_impl, vi_hash);
1541 vcache_dealloc(new_vip);
1542 }
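/*
 * Illustrative sketch of the two-step rekey protocol for a hypothetical
 * file system whose key (here an inode number) changes; the examplefs_*
 * names and the node layout are placeholders (non-compiled, "#if 0"
 * guarded).
 */
#if 0
struct examplefs_node {
	ino_t ino;			/* hypothetical fs node, keyed by ino */
};

static int
examplefs_renumber(struct vnode *vp, ino_t newino)
{
	struct examplefs_node *np = vp->v_data;
	ino_t oldino = np->ino;
	int error;

	/*
	 * Insert a placeholder under the new key and switch the cached
	 * node's key to the stack copy of the old one so the fs node
	 * itself may be modified.
	 */
	error = vcache_rekey_enter(vp->v_mount, vp, &oldino, sizeof(oldino),
	    &newino, sizeof(newino));
	if (error != 0)
		return error;		/* EEXIST: new key already cached */

	np->ino = newino;		/* update the fs node */

	/*
	 * Publish the new key, which must outlive the stack copies, and
	 * drop the placeholder.
	 */
	vcache_rekey_exit(vp->v_mount, vp, &oldino, sizeof(oldino),
	    &np->ino, sizeof(np->ino));
	return 0;
}
#endif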
1543
1544 /*
1545 * Disassociate the underlying file system from a vnode.
1546 *
1547 * Must be called with vnode locked and will return unlocked.
1548 * Must be called with the interlock held, and will return with it held.
1549 */
1550 static void
1551 vcache_reclaim(vnode_t *vp)
1552 {
1553 lwp_t *l = curlwp;
1554 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1555 struct mount *mp = vp->v_mount;
1556 uint32_t hash;
1557 uint8_t temp_buf[64], *temp_key;
1558 size_t temp_key_len;
1559 bool recycle, active;
1560 int error;
1561
1562 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1563 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1564 KASSERT(mutex_owned(vp->v_interlock));
1565 KASSERT(vp->v_usecount != 0);
1566
1567 active = (vp->v_usecount > 1);
1568 temp_key_len = vip->vi_key.vk_key_len;
1569 /*
1570 * Prevent the vnode from being recycled or brought into use
1571 * while we clean it out.
1572 */
1573 VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING);
1574 if (vp->v_iflag & VI_EXECMAP) {
1575 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1576 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1577 }
1578 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1579 mutex_exit(vp->v_interlock);
1580
1581 /* Replace the vnode key with a temporary copy. */
1582 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1583 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1584 } else {
1585 temp_key = temp_buf;
1586 }
1587 if (vip->vi_key.vk_key_len > 0) {
1588 mutex_enter(&vcache_lock);
1589 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1590 vip->vi_key.vk_key = temp_key;
1591 mutex_exit(&vcache_lock);
1592 }
1593
1594 fstrans_start(mp);
1595
1596 /*
1597 * Clean out any cached data associated with the vnode.
1598 * If purging an active vnode, it must be closed and
1599 * deactivated before being reclaimed.
1600 */
1601 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1602 if (error != 0) {
1603 if (wapbl_vphaswapbl(vp))
1604 WAPBL_DISCARD(wapbl_vptomp(vp));
1605 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1606 }
1607 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1608 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1609 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1610 spec_node_revoke(vp);
1611 }
1612
1613 /*
1614 * Disassociate the underlying file system from the vnode.
1615 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1616 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1617 * would no longer function.
1618 */
1619 VOP_INACTIVE(vp, &recycle);
1620 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1621 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1622 if (VOP_RECLAIM(vp)) {
1623 vnpanic(vp, "%s: cannot reclaim", __func__);
1624 }
1625
1626 KASSERT(vp->v_data == NULL);
1627 KASSERT(vp->v_uobj.uo_npages == 0);
1628
1629 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1630 uvm_ra_freectx(vp->v_ractx);
1631 vp->v_ractx = NULL;
1632 }
1633
1634 /* Purge name cache. */
1635 cache_purge(vp);
1636
1637 if (vip->vi_key.vk_key_len > 0) {
1638 /* Remove from vnode cache. */
1639 hash = vcache_hash(&vip->vi_key);
1640 mutex_enter(&vcache_lock);
1641 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1642 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1643 vip, vnode_impl, vi_hash);
1644 mutex_exit(&vcache_lock);
1645 }
1646 if (temp_key != temp_buf)
1647 kmem_free(temp_key, temp_key_len);
1648
1649 /* Done with purge, notify sleepers of the grim news. */
1650 mutex_enter(vp->v_interlock);
1651 vp->v_op = dead_vnodeop_p;
1652 vp->v_vflag |= VV_LOCKSWORK;
1653 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1654 vp->v_tag = VT_NON;
1655 KNOTE(&vp->v_klist, NOTE_REVOKE);
1656 mutex_exit(vp->v_interlock);
1657
1658 /*
1659 * Move to dead mount. Must be after changing the operations
1660 * vector as vnode operations enter the mount before using the
1661 * operations vector. See sys/kern/vnode_if.c.
1662 */
1663 vp->v_vflag &= ~VV_ROOT;
1664 vfs_ref(dead_rootmount);
1665 vfs_insmntque(vp, dead_rootmount);
1666
1667 mutex_enter(vp->v_interlock);
1668 fstrans_done(mp);
1669 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1670 }
1671
1672 /*
1673 * Disassociate the underlying file system from an open device vnode
1674 * and make it anonymous.
1675 *
1676 * Vnode unlocked on entry, drops a reference to the vnode.
1677 */
1678 void
1679 vcache_make_anon(vnode_t *vp)
1680 {
1681 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
1682 uint32_t hash;
1683 bool recycle;
1684
1685 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
1686 KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
1687 VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
1688
1689 /* Remove from vnode cache. */
1690 hash = vcache_hash(&vip->vi_key);
1691 mutex_enter(&vcache_lock);
1692 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1693 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1694 vip, vnode_impl, vi_hash);
1695 vip->vi_key.vk_mount = dead_rootmount;
1696 vip->vi_key.vk_key_len = 0;
1697 vip->vi_key.vk_key = NULL;
1698 mutex_exit(&vcache_lock);
1699
1700 /*
1701 * Disassociate the underlying file system from the vnode.
1702 * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
1703 * the vnode, and may destroy the vnode so that VOP_UNLOCK
1704 * would no longer function.
1705 */
1706 if (vn_lock(vp, LK_EXCLUSIVE)) {
1707 vnpanic(vp, "%s: cannot lock", __func__);
1708 }
1709 VOP_INACTIVE(vp, &recycle);
1710 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1711 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1712 if (VOP_RECLAIM(vp)) {
1713 vnpanic(vp, "%s: cannot reclaim", __func__);
1714 }
1715
1716 /* Purge name cache. */
1717 cache_purge(vp);
1718
1719 /* Done with purge, change operations vector. */
1720 mutex_enter(vp->v_interlock);
1721 vp->v_op = spec_vnodeop_p;
1722 vp->v_vflag |= VV_MPSAFE;
1723 vp->v_vflag &= ~VV_LOCKSWORK;
1724 mutex_exit(vp->v_interlock);
1725
1726 /*
1727 * Move to dead mount. Must be after changing the operations
1728 * vector as vnode operations enter the mount before using the
1729 * operations vector. See sys/kern/vnode_if.c.
1730 */
1731 vfs_ref(dead_rootmount);
1732 vfs_insmntque(vp, dead_rootmount);
1733
1734 vrele(vp);
1735 }
1736
1737 /*
1738 * Update outstanding I/O count and do wakeup if requested.
1739 */
1740 void
1741 vwakeup(struct buf *bp)
1742 {
1743 vnode_t *vp;
1744
1745 if ((vp = bp->b_vp) == NULL)
1746 return;
1747
1748 KASSERT(bp->b_objlock == vp->v_interlock);
1749 KASSERT(mutex_owned(bp->b_objlock));
1750
1751 if (--vp->v_numoutput < 0)
1752 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1753 if (vp->v_numoutput == 0)
1754 cv_broadcast(&vp->v_cv);
1755 }
1756
1757 /*
1758 * Test a vnode for being or becoming dead. Returns one of:
1759 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1760 * ENOENT: vnode is dead.
1761 * 0: otherwise.
1762 *
1763 * Whenever this function returns a non-zero value all future
1764 * calls will also return a non-zero value.
1765 */
1766 int
1767 vdead_check(struct vnode *vp, int flags)
1768 {
1769
1770 KASSERT(mutex_owned(vp->v_interlock));
1771
1772 if (! ISSET(flags, VDEAD_NOWAIT))
1773 VSTATE_WAIT_STABLE(vp);
1774
1775 if (VSTATE_GET(vp) == VS_RECLAIMING) {
1776 KASSERT(ISSET(flags, VDEAD_NOWAIT));
1777 return EBUSY;
1778 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
1779 return ENOENT;
1780 }
1781
1782 return 0;
1783 }
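/*
 * Illustrative sketch of a vdead_check() caller that must not sleep,
 * e.g. a device driver path deciding whether the vnode is being revoked.
 * The example_* name is a placeholder (non-compiled, "#if 0" guarded).
 */
#if 0
static int
example_check_dead(struct vnode *vp)
{
	int error;

	mutex_enter(vp->v_interlock);
	error = vdead_check(vp, VDEAD_NOWAIT);
	mutex_exit(vp->v_interlock);

	/*
	 * error == 0:      vnode is alive.
	 * error == EBUSY:  vnode is becoming dead (VDEAD_NOWAIT only).
	 * error == ENOENT: vnode is dead.
	 */
	return error;
}
#endif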
1784
1785 int
1786 vfs_drainvnodes(void)
1787 {
1788 int i, gen;
1789
1790 mutex_enter(&vdrain_lock);
1791 for (i = 0; i < 2; i++) {
1792 gen = vdrain_gen;
1793 while (gen == vdrain_gen) {
1794 cv_broadcast(&vdrain_cv);
1795 cv_wait(&vdrain_gen_cv, &vdrain_lock);
1796 }
1797 }
1798 mutex_exit(&vdrain_lock);
1799
1800 if (numvnodes >= desiredvnodes)
1801 return EBUSY;
1802
1803 if (vcache_hashsize != desiredvnodes)
1804 vcache_reinit();
1805
1806 return 0;
1807 }
1808
1809 void
1810 vnpanic(vnode_t *vp, const char *fmt, ...)
1811 {
1812 va_list ap;
1813
1814 #ifdef DIAGNOSTIC
1815 vprint(NULL, vp);
1816 #endif
1817 va_start(ap, fmt);
1818 vpanic(fmt, ap);
1819 va_end(ap);
1820 }
1821