/*	$NetBSD: vfs_vnode.c,v 1.53.2.4 2017/04/26 02:53:27 pgoyette Exp $	*/
2
3 /*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 /*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of inactive vnode, via vcache_vget(9).
80 *
81 * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
82 * was another, traditional way. Currently, only the draining thread
83 * recycles the vnodes. This behaviour might be revisited.
84 *
 * The life-cycle ends when the last reference is dropped, usually
 * in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
 * inform the file system that the vnode is inactive.  Via this call,
 * the file system indicates whether the vnode can be recycled
 * (usually, it checks its own references, e.g. the link count and
 * whether the file was removed).
 *
 * Depending on this indication, the vnode can be put onto a free list
 * (cache), or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9)
 * to disassociate the underlying file system from the vnode, and
 * finally destroyed.
94 *
95 * Vnode state
96 *
97 * Vnode is always in one of six states:
98 * - MARKER This is a marker vnode to help list traversal. It
99 * will never change its state.
100 * - LOADING Vnode is associating underlying file system and not
101 * yet ready to use.
102 * - ACTIVE Vnode has associated underlying file system and is
103 * ready to use.
104 * - BLOCKED Vnode is active but cannot get new references.
105 * - RECLAIMING Vnode is disassociating from the underlying file
106 * system.
107 * - RECLAIMED Vnode has disassociated from underlying file system
108 * and is dead.
109 *
110 * Valid state changes are:
111 * LOADING -> ACTIVE
112 * Vnode has been initialised in vcache_get() or
113 * vcache_new() and is ready to use.
114 * ACTIVE -> RECLAIMING
115 * Vnode starts disassociation from underlying file
 * system in vcache_reclaim().
117 * RECLAIMING -> RECLAIMED
118 * Vnode finished disassociation from underlying file
 * system in vcache_reclaim().
120 * ACTIVE -> BLOCKED
121 * Either vcache_rekey*() is changing the vnode key or
122 * vrelel() is about to call VOP_INACTIVE().
123 * BLOCKED -> ACTIVE
124 * The block condition is over.
125 * LOADING -> RECLAIMED
126 * Either vcache_get() or vcache_new() failed to
127 * associate the underlying file system or vcache_rekey*()
128 * drops a vnode used as placeholder.
129 *
130 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate
131 * and it is possible to wait for state change.
132 *
133 * State is protected with v_interlock with one exception:
134 * to change from LOADING both v_interlock and vcache_lock must be held
135 * so it is possible to check "state == LOADING" without holding
136 * v_interlock. See vcache_get() for details.
137 *
138 * Reference counting
139 *
 * A vnode is considered active if its reference count
 * (vnode_t::v_usecount) is non-zero.  The count is maintained using
 * the vref(9) and vrele(9) routines, as well as vput(9).  Common
 * points holding references are, e.g., file openings, the current
 * working directory, mount points, etc.
144 *
145 * Note on v_usecount and its locking
146 *
 * At nearly all points where v_usecount could be zero, the
 * vnode_t::v_interlock will be held.  To change v_usecount away
 * from zero, the interlock must be held.  To change from a non-zero
 * value to zero, again the interlock must be held.
 *
 * Changing the usecount from a non-zero value to a non-zero value can
 * safely be done using atomic operations, without the interlock held
 * (an illustrative sketch follows the includes below).
154 *
155 */
156
157 #include <sys/cdefs.h>
158 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.53.2.4 2017/04/26 02:53:27 pgoyette Exp $");
159
160 #include <sys/param.h>
161 #include <sys/kernel.h>
162
163 #include <sys/atomic.h>
164 #include <sys/buf.h>
165 #include <sys/conf.h>
166 #include <sys/device.h>
167 #include <sys/hash.h>
168 #include <sys/kauth.h>
169 #include <sys/kmem.h>
170 #include <sys/kthread.h>
171 #include <sys/module.h>
172 #include <sys/mount.h>
173 #include <sys/namei.h>
174 #include <sys/syscallargs.h>
175 #include <sys/sysctl.h>
176 #include <sys/systm.h>
177 #include <sys/vnode_impl.h>
178 #include <sys/wapbl.h>
179 #include <sys/fstrans.h>
180
181 #include <uvm/uvm.h>
182 #include <uvm/uvm_readahead.h>
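
/*
 * Illustrative sketch of the v_usecount rules described in the comment
 * at the top of this file; wrapped in #if 0 and not part of the build.
 * The "example_*" names are hypothetical.  A caller that already holds
 * a reference may take another one with a plain vref(), while a caller
 * that found the vnode through some index (where v_usecount may be
 * zero) has to go through v_interlock and vcache_tryvget().
 */
#if 0
static void
example_take_extra_reference(vnode_t *vp)
{

	/* Caller already holds a reference, so the count cannot be zero. */
	vref(vp);

	/* ... use the extra reference ... */

	vrele(vp);
}

static int
example_reference_from_index(vnode_t *vp)
{

	/* v_usecount may be zero here, so the interlock is required. */
	mutex_enter(vp->v_interlock);
	return vcache_tryvget(vp);	/* releases v_interlock */
}
#endif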
183
184 /* Flags to vrelel. */
185 #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
186 #define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */
187
188 u_int numvnodes __cacheline_aligned;
189
190 /*
191 * There are three lru lists: one holds vnodes waiting for async release,
192 * one is for vnodes which have no buffer/page references and
193 * one for those which do (i.e. v_holdcnt is non-zero).
194 */
195 static vnodelst_t lru_vrele_list __cacheline_aligned;
196 static vnodelst_t lru_free_list __cacheline_aligned;
197 static vnodelst_t lru_hold_list __cacheline_aligned;
198 static kmutex_t vdrain_lock __cacheline_aligned;
199 static kcondvar_t vdrain_cv __cacheline_aligned;
200 static int vdrain_gen;
201 static kcondvar_t vdrain_gen_cv;
202 static bool vdrain_retry;
203 static lwp_t * vdrain_lwp;
204 SLIST_HEAD(hashhead, vnode_impl);
205 static kmutex_t vcache_lock __cacheline_aligned;
206 static kcondvar_t vcache_cv __cacheline_aligned;
207 static u_int vcache_hashsize;
208 static u_long vcache_hashmask;
209 static struct hashhead *vcache_hashtab __cacheline_aligned;
210 static pool_cache_t vcache_pool;
211 static void lru_requeue(vnode_t *, vnodelst_t *);
212 static vnodelst_t * lru_which(vnode_t *);
213 static vnode_impl_t * vcache_alloc(void);
214 static void vcache_dealloc(vnode_impl_t *);
215 static void vcache_free(vnode_impl_t *);
216 static void vcache_init(void);
217 static void vcache_reinit(void);
static void	vcache_reclaim(vnode_t *);
static void	vclean(vnode_t *);
219 static void vrelel(vnode_t *, int);
220 static void vdrain_thread(void *);
221 static void vnpanic(vnode_t *, const char *, ...)
222 __printflike(2, 3);
223
224 /* Routines having to do with the management of the vnode table. */
225 extern struct mount *dead_rootmount;
226 extern int (**dead_vnodeop_p)(void *);
227 extern struct vfsops dead_vfsops;
228
229 /* Vnode state operations and diagnostics. */
230
231 #if defined(DIAGNOSTIC)
232
233 #define VSTATE_GET(vp) \
234 vstate_assert_get((vp), __func__, __LINE__)
235 #define VSTATE_CHANGE(vp, from, to) \
236 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
237 #define VSTATE_WAIT_STABLE(vp) \
238 vstate_assert_wait_stable((vp), __func__, __LINE__)
239 #define VSTATE_ASSERT(vp, state) \
240 vstate_assert((vp), (state), __func__, __LINE__)
241
242 static void
243 vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line)
244 {
245 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
246
247 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
248
249 if (__predict_true(vip->vi_state == state))
250 return;
251 vnpanic(vp, "state is %s, expected %s at %s:%d",
252 vstate_name(vip->vi_state), vstate_name(state), func, line);
253 }
254
255 static enum vnode_state
256 vstate_assert_get(vnode_t *vp, const char *func, int line)
257 {
258 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
259
260 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
261 if (vip->vi_state == VS_MARKER)
262 vnpanic(vp, "state is %s at %s:%d",
263 vstate_name(vip->vi_state), func, line);
264
265 return vip->vi_state;
266 }
267
268 static void
269 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
270 {
271 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
272
273 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
274 if (vip->vi_state == VS_MARKER)
275 vnpanic(vp, "state is %s at %s:%d",
276 vstate_name(vip->vi_state), func, line);
277
278 while (vip->vi_state != VS_ACTIVE && vip->vi_state != VS_RECLAIMED)
279 cv_wait(&vp->v_cv, vp->v_interlock);
280
281 if (vip->vi_state == VS_MARKER)
282 vnpanic(vp, "state is %s at %s:%d",
283 vstate_name(vip->vi_state), func, line);
284 }
285
286 static void
287 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
288 const char *func, int line)
289 {
290 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
291
292 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
293 if (from == VS_LOADING)
294 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
295
296 if (from == VS_MARKER)
297 vnpanic(vp, "from is %s at %s:%d",
298 vstate_name(from), func, line);
299 if (to == VS_MARKER)
300 vnpanic(vp, "to is %s at %s:%d",
301 vstate_name(to), func, line);
302 if (vip->vi_state != from)
303 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
304 vstate_name(vip->vi_state), vstate_name(from), func, line);
305 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1)
306 vnpanic(vp, "%s to %s with usecount %d at %s:%d",
307 vstate_name(from), vstate_name(to), vp->v_usecount,
308 func, line);
309
310 vip->vi_state = to;
311 if (from == VS_LOADING)
312 cv_broadcast(&vcache_cv);
313 if (to == VS_ACTIVE || to == VS_RECLAIMED)
314 cv_broadcast(&vp->v_cv);
315 }
316
317 #else /* defined(DIAGNOSTIC) */
318
319 #define VSTATE_GET(vp) \
320 (VNODE_TO_VIMPL((vp))->vi_state)
321 #define VSTATE_CHANGE(vp, from, to) \
322 vstate_change((vp), (from), (to))
323 #define VSTATE_WAIT_STABLE(vp) \
324 vstate_wait_stable((vp))
325 #define VSTATE_ASSERT(vp, state)
326
327 static void
328 vstate_wait_stable(vnode_t *vp)
329 {
330 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
331
332 while (vip->vi_state != VS_ACTIVE && vip->vi_state != VS_RECLAIMED)
333 cv_wait(&vp->v_cv, vp->v_interlock);
334 }
335
336 static void
337 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
338 {
339 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
340
341 vip->vi_state = to;
342 if (from == VS_LOADING)
343 cv_broadcast(&vcache_cv);
344 if (to == VS_ACTIVE || to == VS_RECLAIMED)
345 cv_broadcast(&vp->v_cv);
346 }
347
348 #endif /* defined(DIAGNOSTIC) */
349
350 void
351 vfs_vnode_sysinit(void)
352 {
353 int error __diagused;
354
355 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
356 KASSERT(dead_rootmount != NULL);
357 dead_rootmount->mnt_iflag = IMNT_MPSAFE;
358
359 mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
360 TAILQ_INIT(&lru_free_list);
361 TAILQ_INIT(&lru_hold_list);
362 TAILQ_INIT(&lru_vrele_list);
363
364 vcache_init();
365
366 cv_init(&vdrain_cv, "vdrain");
367 cv_init(&vdrain_gen_cv, "vdrainwt");
368 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
369 NULL, &vdrain_lwp, "vdrain");
370 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
371 }
372
373 /*
374 * Allocate a new marker vnode.
375 */
376 vnode_t *
377 vnalloc_marker(struct mount *mp)
378 {
379 vnode_impl_t *vip;
380 vnode_t *vp;
381
382 vip = pool_cache_get(vcache_pool, PR_WAITOK);
383 memset(vip, 0, sizeof(*vip));
384 vp = VIMPL_TO_VNODE(vip);
385 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
386 vp->v_mount = mp;
387 vp->v_type = VBAD;
388 vip->vi_state = VS_MARKER;
389
390 return vp;
391 }
392
393 /*
394 * Free a marker vnode.
395 */
396 void
397 vnfree_marker(vnode_t *vp)
398 {
399 vnode_impl_t *vip;
400
401 vip = VNODE_TO_VIMPL(vp);
402 KASSERT(vip->vi_state == VS_MARKER);
403 uvm_obj_destroy(&vp->v_uobj, true);
404 pool_cache_put(vcache_pool, vip);
405 }
406
407 /*
408 * Test a vnode for being a marker vnode.
409 */
410 bool
411 vnis_marker(vnode_t *vp)
412 {
413
414 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
415 }
416
417 /*
 * Set vnode to share another vnode's lock.
419 */
420 void
421 vshare_lock(vnode_t *vp, vnode_t *src_vp)
422 {
423 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
424 vnode_impl_t *src_vip = VNODE_TO_VIMPL(src_vp);
425 krwlock_t *oldlock = vip->vi_lock;
426
427 rw_obj_hold(src_vip->vi_lock);
428 vip->vi_lock = src_vip->vi_lock;
429 rw_obj_free(oldlock);
430 }
431
432 /*
433 * Return the lru list this node should be on.
434 */
435 static vnodelst_t *
436 lru_which(vnode_t *vp)
437 {
438
439 KASSERT(mutex_owned(vp->v_interlock));
440
441 if (vp->v_holdcnt > 0)
442 return &lru_hold_list;
443 else
444 return &lru_free_list;
445 }
446
447 /*
448 * Put vnode to end of given list.
449 * Both the current and the new list may be NULL, used on vnode alloc/free.
450 * Adjust numvnodes and signal vdrain thread if there is work.
451 */
452 static void
453 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
454 {
455 vnode_impl_t *vip;
456
457 mutex_enter(&vdrain_lock);
458 vip = VNODE_TO_VIMPL(vp);
459 if (vip->vi_lrulisthd != NULL)
460 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
461 else
462 numvnodes++;
463 vip->vi_lrulisthd = listhd;
464 if (vip->vi_lrulisthd != NULL)
465 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
466 else
467 numvnodes--;
468 if (numvnodes > desiredvnodes || listhd == &lru_vrele_list)
469 cv_broadcast(&vdrain_cv);
470 mutex_exit(&vdrain_lock);
471 }
472
473 /*
474 * Release deferred vrele vnodes for this mount.
475 * Called with file system suspended.
476 */
477 void
478 vrele_flush(struct mount *mp)
479 {
480 vnode_impl_t *vip, *marker;
481
482 KASSERT(fstrans_is_owner(mp));
483
484 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
485
486 mutex_enter(&vdrain_lock);
487 TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist);
488
489 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
490 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
491 TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist);
492 if (vnis_marker(VIMPL_TO_VNODE(vip)))
493 continue;
494
495 KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
496 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
497 vip->vi_lrulisthd = &lru_hold_list;
498 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
499 mutex_exit(&vdrain_lock);
500
501 mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock);
502 vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE);
503
504 mutex_enter(&vdrain_lock);
505 }
506
507 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
508 mutex_exit(&vdrain_lock);
509
510 vnfree_marker(VIMPL_TO_VNODE(marker));
511 }
512
513 /*
514 * Reclaim a cached vnode. Used from vdrain_thread only.
515 */
516 static __inline void
517 vdrain_remove(vnode_t *vp)
518 {
519 struct mount *mp;
520
521 KASSERT(mutex_owned(&vdrain_lock));
522
523 /* Probe usecount (unlocked). */
524 if (vp->v_usecount > 0)
525 return;
526 /* Try v_interlock -- we lock the wrong direction! */
527 if (!mutex_tryenter(vp->v_interlock))
528 return;
529 /* Probe usecount and state. */
530 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_ACTIVE) {
531 mutex_exit(vp->v_interlock);
532 return;
533 }
534 mp = vp->v_mount;
535 if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
536 mutex_exit(vp->v_interlock);
537 return;
538 }
539 vdrain_retry = true;
540 mutex_exit(&vdrain_lock);
541
542 if (vcache_vget(vp) == 0) {
543 if (!vrecycle(vp)) {
544 mutex_enter(vp->v_interlock);
545 vrelel(vp, VRELEL_FORCE_RELE);
546 }
547 }
548 fstrans_done(mp);
549
550 mutex_enter(&vdrain_lock);
551 }
552
553 /*
554 * Release a cached vnode. Used from vdrain_thread only.
555 */
556 static __inline void
557 vdrain_vrele(vnode_t *vp)
558 {
559 vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
560 struct mount *mp;
561
562 KASSERT(mutex_owned(&vdrain_lock));
563
564 mp = vp->v_mount;
565 if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0)
566 return;
567
568 /*
569 * First remove the vnode from the vrele list.
570 * Put it on the last lru list, the last vrele()
571 * will put it back onto the right list before
572 * its v_usecount reaches zero.
573 */
574 KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
575 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
576 vip->vi_lrulisthd = &lru_hold_list;
577 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
578
579 vdrain_retry = true;
580 mutex_exit(&vdrain_lock);
581
582 mutex_enter(vp->v_interlock);
583 vrelel(vp, VRELEL_FORCE_RELE);
584 fstrans_done(mp);
585
586 mutex_enter(&vdrain_lock);
587 }
588
589 /*
590 * Helper thread to keep the number of vnodes below desiredvnodes
591 * and release vnodes from asynchronous vrele.
592 */
593 static void
594 vdrain_thread(void *cookie)
595 {
596 vnodelst_t *listhd[] = {
597 &lru_vrele_list, &lru_free_list, &lru_hold_list
598 };
599 int i;
600 u_int target;
601 vnode_impl_t *vip, *marker;
602
603 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
604
605 mutex_enter(&vdrain_lock);
606
607 for (;;) {
608 vdrain_retry = false;
609 target = desiredvnodes - desiredvnodes/10;
610
611 for (i = 0; i < __arraycount(listhd); i++) {
612 TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist);
613 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
614 TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
615 TAILQ_INSERT_AFTER(listhd[i], vip, marker,
616 vi_lrulist);
617 if (vnis_marker(VIMPL_TO_VNODE(vip)))
618 continue;
619 if (listhd[i] == &lru_vrele_list)
620 vdrain_vrele(VIMPL_TO_VNODE(vip));
621 else if (numvnodes < target)
622 break;
623 else
624 vdrain_remove(VIMPL_TO_VNODE(vip));
625 }
626 TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
627 }
628
629 if (vdrain_retry) {
630 mutex_exit(&vdrain_lock);
631 yield();
632 mutex_enter(&vdrain_lock);
633 } else {
634 vdrain_gen++;
635 cv_broadcast(&vdrain_gen_cv);
636 cv_wait(&vdrain_cv, &vdrain_lock);
637 }
638 }
639 }
640
641 /*
642 * vput: unlock and release the reference.
643 */
644 void
645 vput(vnode_t *vp)
646 {
647
648 VOP_UNLOCK(vp);
649 vrele(vp);
650 }
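
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build:
 * vput() is shorthand for dropping the vnode lock and the reference
 * in one call.  The "example_*" name is hypothetical.
 */
#if 0
static void
example_use_locked_and_release(vnode_t *vp)
{

	/* Caller holds a reference; take the vnode lock to work on it. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/* ... operate on the locked vnode ... */

	/* Equivalent to VOP_UNLOCK(vp) followed by vrele(vp). */
	vput(vp);
}
#endif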
651
652 /*
653 * Try to drop reference on a vnode. Abort if we are releasing the
654 * last reference. Note: this _must_ succeed if not the last reference.
655 */
656 static inline bool
657 vtryrele(vnode_t *vp)
658 {
659 u_int use, next;
660
661 for (use = vp->v_usecount;; use = next) {
662 if (use == 1) {
663 return false;
664 }
665 KASSERT(use > 1);
666 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
667 if (__predict_true(next == use)) {
668 return true;
669 }
670 }
671 }
672
673 /*
674 * Vnode release. If reference count drops to zero, call inactive
675 * routine and either return to freelist or free to the pool.
676 */
677 static void
678 vrelel(vnode_t *vp, int flags)
679 {
680 const bool async = ((flags & VRELEL_ASYNC_RELE) != 0);
681 const bool force = ((flags & VRELEL_FORCE_RELE) != 0);
682 bool recycle, defer;
683 int error;
684
685 KASSERT(mutex_owned(vp->v_interlock));
686
687 if (__predict_false(vp->v_op == dead_vnodeop_p &&
688 VSTATE_GET(vp) != VS_RECLAIMED)) {
689 vnpanic(vp, "dead but not clean");
690 }
691
692 /*
693 * If not the last reference, just drop the reference count
694 * and unlock.
695 */
696 if (vtryrele(vp)) {
697 mutex_exit(vp->v_interlock);
698 return;
699 }
700 if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
701 vnpanic(vp, "%s: bad ref count", __func__);
702 }
703
704 #ifdef DIAGNOSTIC
705 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
706 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
707 vprint("vrelel: missing VOP_CLOSE()", vp);
708 }
709 #endif
710
	/*
	 * First try to get the vnode locked for VOP_INACTIVE().
	 * Defer vnode release to vdrain_thread if the caller requests
	 * it explicitly, is the pagedaemon, or if taking the lock failed.
	 */
716 if ((curlwp == uvm.pagedaemon_lwp) || async) {
717 defer = true;
718 } else {
719 mutex_exit(vp->v_interlock);
720 error = vn_lock(vp,
721 LK_EXCLUSIVE | LK_RETRY | (force ? 0 : LK_NOWAIT));
722 defer = (error != 0);
723 mutex_enter(vp->v_interlock);
724 }
725 KASSERT(mutex_owned(vp->v_interlock));
726 KASSERT(! (force && defer));
727 if (defer) {
728 /*
729 * Defer reclaim to the kthread; it's not safe to
730 * clean it here. We donate it our last reference.
731 */
732 lru_requeue(vp, &lru_vrele_list);
733 mutex_exit(vp->v_interlock);
734 return;
735 }
736
737 /*
738 * If the node got another reference while we
739 * released the interlock, don't try to inactivate it yet.
740 */
741 if (__predict_false(vtryrele(vp))) {
742 VOP_UNLOCK(vp);
743 mutex_exit(vp->v_interlock);
744 return;
745 }
746
747 /*
748 * If not clean, deactivate the vnode, but preserve
749 * our reference across the call to VOP_INACTIVE().
750 */
751 if (VSTATE_GET(vp) == VS_RECLAIMED) {
752 VOP_UNLOCK(vp);
753 } else {
754 VSTATE_CHANGE(vp, VS_ACTIVE, VS_BLOCKED);
755 mutex_exit(vp->v_interlock);
756
757 /*
758 * The vnode must not gain another reference while being
759 * deactivated. If VOP_INACTIVE() indicates that
760 * the described file has been deleted, then recycle
761 * the vnode.
762 *
763 * Note that VOP_INACTIVE() will not drop the vnode lock.
764 */
765 recycle = false;
766 VOP_INACTIVE(vp, &recycle);
767 if (!recycle)
768 VOP_UNLOCK(vp);
769 mutex_enter(vp->v_interlock);
770 VSTATE_CHANGE(vp, VS_BLOCKED, VS_ACTIVE);
771 if (!recycle) {
772 if (vtryrele(vp)) {
773 mutex_exit(vp->v_interlock);
774 return;
775 }
776 }
777
778 /* Take care of space accounting. */
779 if (vp->v_iflag & VI_EXECMAP) {
780 atomic_add_int(&uvmexp.execpages,
781 -vp->v_uobj.uo_npages);
782 atomic_add_int(&uvmexp.filepages,
783 vp->v_uobj.uo_npages);
784 }
785 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
786 vp->v_vflag &= ~VV_MAPPED;
787
788 /*
789 * Recycle the vnode if the file is now unused (unlinked),
790 * otherwise just free it.
791 */
792 if (recycle) {
793 VSTATE_ASSERT(vp, VS_ACTIVE);
794 /* vcache_reclaim drops the lock. */
795 vcache_reclaim(vp);
796 }
797 KASSERT(vp->v_usecount > 0);
798 }
799
800 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
801 /* Gained another reference while being reclaimed. */
802 mutex_exit(vp->v_interlock);
803 return;
804 }
805
806 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
807 /*
808 * It's clean so destroy it. It isn't referenced
809 * anywhere since it has been reclaimed.
810 */
811 vcache_free(VNODE_TO_VIMPL(vp));
812 } else {
813 /*
814 * Otherwise, put it back onto the freelist. It
815 * can't be destroyed while still associated with
816 * a file system.
817 */
818 lru_requeue(vp, lru_which(vp));
819 mutex_exit(vp->v_interlock);
820 }
821 }
822
823 void
824 vrele(vnode_t *vp)
825 {
826
827 if (vtryrele(vp)) {
828 return;
829 }
830 mutex_enter(vp->v_interlock);
831 vrelel(vp, 0);
832 }
833
834 /*
 * Asynchronous vnode release: the vnode is released in a different context.
836 */
837 void
838 vrele_async(vnode_t *vp)
839 {
840
841 if (vtryrele(vp)) {
842 return;
843 }
844 mutex_enter(vp->v_interlock);
845 vrelel(vp, VRELEL_ASYNC_RELE);
846 }
847
848 /*
849 * Vnode reference, where a reference is already held by some other
850 * object (for example, a file structure).
851 */
852 void
853 vref(vnode_t *vp)
854 {
855
856 KASSERT(vp->v_usecount != 0);
857
858 atomic_inc_uint(&vp->v_usecount);
859 }
860
861 /*
862 * Page or buffer structure gets a reference.
863 * Called with v_interlock held.
864 */
865 void
866 vholdl(vnode_t *vp)
867 {
868
869 KASSERT(mutex_owned(vp->v_interlock));
870
871 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0)
872 lru_requeue(vp, lru_which(vp));
873 }
874
875 /*
876 * Page or buffer structure frees a reference.
877 * Called with v_interlock held.
878 */
879 void
880 holdrelel(vnode_t *vp)
881 {
882
883 KASSERT(mutex_owned(vp->v_interlock));
884
885 if (vp->v_holdcnt <= 0) {
886 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
887 }
888
889 vp->v_holdcnt--;
890 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
891 lru_requeue(vp, lru_which(vp));
892 }
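
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build: how
 * buffer or page handling code would pin a vnode with the hold count
 * so it stays off the free list while I/O structures reference it.
 * The "example_*" name is hypothetical.
 */
#if 0
static void
example_hold_across_io(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	vholdl(vp);
	mutex_exit(vp->v_interlock);

	/* ... buffer or page keeps pointing at the vnode ... */

	mutex_enter(vp->v_interlock);
	holdrelel(vp);
	mutex_exit(vp->v_interlock);
}
#endif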
893
894 /*
895 * Disassociate the underlying file system from a vnode.
896 *
897 * Must be called with vnode locked and will return unlocked.
898 * Must be called with the interlock held, and will return with it held.
899 */
900 static void
901 vclean(vnode_t *vp)
902 {
903 lwp_t *l = curlwp;
904 bool recycle, active;
905 int error;
906
907 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
908 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
909 KASSERT(mutex_owned(vp->v_interlock));
910 KASSERT(vp->v_usecount != 0);
911
912 active = (vp->v_usecount > 1);
913 /*
914 * Prevent the vnode from being recycled or brought into use
915 * while we clean it out.
916 */
	VSTATE_CHANGE(vp, VS_ACTIVE, VS_RECLAIMING);
918 if (vp->v_iflag & VI_EXECMAP) {
919 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
920 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
921 }
922 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
923 mutex_exit(vp->v_interlock);
924
925 /*
926 * Clean out any cached data associated with the vnode.
927 * If purging an active vnode, it must be closed and
928 * deactivated before being reclaimed. Note that the
929 * VOP_INACTIVE will unlock the vnode.
930 */
931 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
932 if (error != 0) {
933 if (wapbl_vphaswapbl(vp))
934 WAPBL_DISCARD(wapbl_vptomp(vp));
935 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
936 }
937 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
938 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
939 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
940 spec_node_revoke(vp);
941 }
942 if (active) {
943 VOP_INACTIVE(vp, &recycle);
944 } else {
945 /*
946 * Any other processes trying to obtain this lock must first
 * wait for VS_RECLAIMED, then call the new lock operation.
948 */
949 VOP_UNLOCK(vp);
950 }
951
952 /* Disassociate the underlying file system from the vnode. */
953 if (VOP_RECLAIM(vp)) {
954 vnpanic(vp, "%s: cannot reclaim", __func__);
955 }
956
957 KASSERT(vp->v_data == NULL);
958 KASSERT(vp->v_uobj.uo_npages == 0);
959
960 if (vp->v_type == VREG && vp->v_ractx != NULL) {
961 uvm_ra_freectx(vp->v_ractx);
962 vp->v_ractx = NULL;
963 }
964
965 /* Purge name cache. */
966 cache_purge(vp);
967
968 /* Move to dead mount. */
969 vp->v_vflag &= ~VV_ROOT;
970 atomic_inc_uint(&dead_rootmount->mnt_refcnt);
971 vfs_insmntque(vp, dead_rootmount);
972
973 /* Done with purge, notify sleepers of the grim news. */
974 mutex_enter(vp->v_interlock);
975 vp->v_op = dead_vnodeop_p;
976 vp->v_vflag |= VV_LOCKSWORK;
	VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
978 vp->v_tag = VT_NON;
979 KNOTE(&vp->v_klist, NOTE_REVOKE);
980
981 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
982 }
983
984 /*
985 * Recycle an unused vnode if caller holds the last reference.
986 */
987 bool
988 vrecycle(vnode_t *vp)
989 {
990 int error __diagused;
991
992 mutex_enter(vp->v_interlock);
993
994 /* Make sure we hold the last reference. */
995 VSTATE_WAIT_STABLE(vp);
996 if (vp->v_usecount != 1) {
997 mutex_exit(vp->v_interlock);
998 return false;
999 }
1000
1001 /* If the vnode is already clean we're done. */
1002 if (VSTATE_GET(vp) != VS_ACTIVE) {
1003 VSTATE_ASSERT(vp, VS_RECLAIMED);
1004 vrelel(vp, 0);
1005 return true;
1006 }
1007
1008 /* Prevent further references until the vnode is locked. */
1009 VSTATE_CHANGE(vp, VS_ACTIVE, VS_BLOCKED);
1010 mutex_exit(vp->v_interlock);
1011
1012 /*
1013 * On a leaf file system this lock will always succeed as we hold
1014 * the last reference and prevent further references.
1015 * On layered file systems waiting for the lock would open a can of
1016 * deadlocks as the lower vnodes may have other active references.
1017 */
1018 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
1019
1020 mutex_enter(vp->v_interlock);
1021 VSTATE_CHANGE(vp, VS_BLOCKED, VS_ACTIVE);
1022
1023 if (error) {
1024 mutex_exit(vp->v_interlock);
1025 return false;
1026 }
1027
1028 KASSERT(vp->v_usecount == 1);
1029 vcache_reclaim(vp);
1030 vrelel(vp, 0);
1031
1032 return true;
1033 }
1034
1035 /*
1036 * Eliminate all activity associated with the requested vnode
1037 * and with all vnodes aliased to the requested vnode.
1038 */
1039 void
1040 vrevoke(vnode_t *vp)
1041 {
1042 vnode_t *vq;
1043 enum vtype type;
1044 dev_t dev;
1045
1046 KASSERT(vp->v_usecount > 0);
1047
1048 mutex_enter(vp->v_interlock);
1049 VSTATE_WAIT_STABLE(vp);
1050 if (VSTATE_GET(vp) == VS_RECLAIMED) {
1051 mutex_exit(vp->v_interlock);
1052 return;
1053 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1054 atomic_inc_uint(&vp->v_usecount);
1055 mutex_exit(vp->v_interlock);
1056 vgone(vp);
1057 return;
1058 } else {
1059 dev = vp->v_rdev;
1060 type = vp->v_type;
1061 mutex_exit(vp->v_interlock);
1062 }
1063
1064 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1065 vgone(vq);
1066 }
1067 }
1068
1069 /*
1070 * Eliminate all activity associated with a vnode in preparation for
1071 * reuse. Drops a reference from the vnode.
1072 */
1073 void
1074 vgone(vnode_t *vp)
1075 {
1076
1077 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1078 mutex_enter(vp->v_interlock);
1079 VSTATE_WAIT_STABLE(vp);
1080 if (VSTATE_GET(vp) == VS_ACTIVE)
1081 vcache_reclaim(vp);
1082 VSTATE_ASSERT(vp, VS_RECLAIMED);
1083 vrelel(vp, 0);
1084 }
1085
1086 static inline uint32_t
1087 vcache_hash(const struct vcache_key *key)
1088 {
1089 uint32_t hash = HASH32_BUF_INIT;
1090
1091 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1092 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1093 return hash;
1094 }
1095
1096 static void
1097 vcache_init(void)
1098 {
1099
1100 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0,
1101 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1102 KASSERT(vcache_pool != NULL);
1103 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
1104 cv_init(&vcache_cv, "vcache");
1105 vcache_hashsize = desiredvnodes;
1106 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1107 &vcache_hashmask);
1108 }
1109
1110 static void
1111 vcache_reinit(void)
1112 {
1113 int i;
1114 uint32_t hash;
1115 u_long oldmask, newmask;
1116 struct hashhead *oldtab, *newtab;
1117 vnode_impl_t *vip;
1118
1119 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1120 mutex_enter(&vcache_lock);
1121 oldtab = vcache_hashtab;
1122 oldmask = vcache_hashmask;
1123 vcache_hashsize = desiredvnodes;
1124 vcache_hashtab = newtab;
1125 vcache_hashmask = newmask;
1126 for (i = 0; i <= oldmask; i++) {
1127 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
1128 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
1129 hash = vcache_hash(&vip->vi_key);
1130 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
1131 vip, vi_hash);
1132 }
1133 }
1134 mutex_exit(&vcache_lock);
1135 hashdone(oldtab, HASH_SLIST, oldmask);
1136 }
1137
1138 static inline vnode_impl_t *
1139 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1140 {
1141 struct hashhead *hashp;
1142 vnode_impl_t *vip;
1143
1144 KASSERT(mutex_owned(&vcache_lock));
1145
1146 hashp = &vcache_hashtab[hash & vcache_hashmask];
1147 SLIST_FOREACH(vip, hashp, vi_hash) {
1148 if (key->vk_mount != vip->vi_key.vk_mount)
1149 continue;
1150 if (key->vk_key_len != vip->vi_key.vk_key_len)
1151 continue;
1152 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
1153 continue;
1154 return vip;
1155 }
1156 return NULL;
1157 }
1158
1159 /*
1160 * Allocate a new, uninitialized vcache node.
1161 */
1162 static vnode_impl_t *
1163 vcache_alloc(void)
1164 {
1165 vnode_impl_t *vip;
1166 vnode_t *vp;
1167
1168 vip = pool_cache_get(vcache_pool, PR_WAITOK);
1169 memset(vip, 0, sizeof(*vip));
1170
1171 vip->vi_lock = rw_obj_alloc();
1172 /* SLIST_INIT(&vip->vi_hash); */
1173 /* LIST_INIT(&vip->vi_nclist); */
1174 /* LIST_INIT(&vip->vi_dnclist); */
1175
1176 vp = VIMPL_TO_VNODE(vip);
1177 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
1178 cv_init(&vp->v_cv, "vnode");
1179
1180 vp->v_usecount = 1;
1181 vp->v_type = VNON;
1182 vp->v_size = vp->v_writesize = VSIZENOTSET;
1183
1184 vip->vi_state = VS_LOADING;
1185
1186 lru_requeue(vp, &lru_free_list);
1187
1188 return vip;
1189 }
1190
1191 /*
1192 * Deallocate a vcache node in state VS_LOADING.
1193 *
1194 * vcache_lock held on entry and released on return.
1195 */
1196 static void
1197 vcache_dealloc(vnode_impl_t *vip)
1198 {
1199 vnode_t *vp;
1200
1201 KASSERT(mutex_owned(&vcache_lock));
1202
1203 vp = VIMPL_TO_VNODE(vip);
1204 mutex_enter(vp->v_interlock);
1205 vp->v_op = dead_vnodeop_p;
1206 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1207 mutex_exit(&vcache_lock);
1208 vrelel(vp, 0);
1209 }
1210
1211 /*
1212 * Free an unused, unreferenced vcache node.
1213 * v_interlock locked on entry.
1214 */
1215 static void
1216 vcache_free(vnode_impl_t *vip)
1217 {
1218 vnode_t *vp;
1219
1220 vp = VIMPL_TO_VNODE(vip);
1221 KASSERT(mutex_owned(vp->v_interlock));
1222
1223 KASSERT(vp->v_usecount == 0);
1224 KASSERT(vp->v_holdcnt == 0);
1225 KASSERT(vp->v_writecount == 0);
1226 lru_requeue(vp, NULL);
1227 mutex_exit(vp->v_interlock);
1228
1229 vfs_insmntque(vp, NULL);
1230 if (vp->v_type == VBLK || vp->v_type == VCHR)
1231 spec_node_destroy(vp);
1232
1233 rw_obj_free(vip->vi_lock);
1234 uvm_obj_destroy(&vp->v_uobj, true);
1235 cv_destroy(&vp->v_cv);
1236 pool_cache_put(vcache_pool, vip);
1237 }
1238
1239 /*
1240 * Try to get an initial reference on this cached vnode.
1241 * Returns zero on success, ENOENT if the vnode has been reclaimed and
1242 * EBUSY if the vnode state is unstable.
1243 *
1244 * v_interlock locked on entry and unlocked on exit.
1245 */
1246 int
1247 vcache_tryvget(vnode_t *vp)
1248 {
1249 int error = 0;
1250
1251 KASSERT(mutex_owned(vp->v_interlock));
1252
1253 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED))
1254 error = ENOENT;
1255 else if (__predict_false(VSTATE_GET(vp) != VS_ACTIVE))
1256 error = EBUSY;
1257 else if (vp->v_usecount == 0)
1258 vp->v_usecount = 1;
1259 else
1260 atomic_inc_uint(&vp->v_usecount);
1261
1262 mutex_exit(vp->v_interlock);
1263
1264 return error;
1265 }
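
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build:
 * taking a reference on a vnode found through some index while holding
 * only v_interlock.  EBUSY means the vnode is in a transient state and
 * the attempt may be retried; ENOENT means it has been reclaimed.  The
 * "example_*" name is hypothetical.
 */
#if 0
static int
example_ref_from_cache(vnode_t *vp)
{
	int error;

	for (;;) {
		mutex_enter(vp->v_interlock);
		error = vcache_tryvget(vp);	/* drops v_interlock */
		if (error != EBUSY)
			return error;
		/* Transient state: let it settle and try again. */
		yield();
	}
}
#endif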
1266
1267 /*
1268 * Try to get an initial reference on this cached vnode.
1269 * Returns zero on success and ENOENT if the vnode has been reclaimed.
1270 * Will wait for the vnode state to be stable.
1271 *
1272 * v_interlock locked on entry and unlocked on exit.
1273 */
1274 int
1275 vcache_vget(vnode_t *vp)
1276 {
1277
1278 KASSERT(mutex_owned(vp->v_interlock));
1279
1280 /* Increment hold count to prevent vnode from disappearing. */
1281 vp->v_holdcnt++;
1282 VSTATE_WAIT_STABLE(vp);
1283 vp->v_holdcnt--;
1284
1285 /* If this was the last reference to a reclaimed vnode free it now. */
1286 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
1287 if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
1288 vcache_free(VNODE_TO_VIMPL(vp));
1289 else
1290 mutex_exit(vp->v_interlock);
1291 return ENOENT;
1292 }
1293 VSTATE_ASSERT(vp, VS_ACTIVE);
1294 if (vp->v_usecount == 0)
1295 vp->v_usecount = 1;
1296 else
1297 atomic_inc_uint(&vp->v_usecount);
1298
1299 mutex_exit(vp->v_interlock);
1300
1301 return 0;
1302 }
1303
1304 /*
1305 * Get a vnode / fs node pair by key and return it referenced through vpp.
1306 */
1307 int
1308 vcache_get(struct mount *mp, const void *key, size_t key_len,
1309 struct vnode **vpp)
1310 {
1311 int error;
1312 uint32_t hash;
1313 const void *new_key;
1314 struct vnode *vp;
1315 struct vcache_key vcache_key;
1316 vnode_impl_t *vip, *new_vip;
1317
1318 new_key = NULL;
1319 *vpp = NULL;
1320
1321 vcache_key.vk_mount = mp;
1322 vcache_key.vk_key = key;
1323 vcache_key.vk_key_len = key_len;
1324 hash = vcache_hash(&vcache_key);
1325
1326 again:
1327 mutex_enter(&vcache_lock);
1328 vip = vcache_hash_lookup(&vcache_key, hash);
1329
1330 /* If found, take a reference or retry. */
1331 if (__predict_true(vip != NULL)) {
1332 /*
1333 * If the vnode is loading we cannot take the v_interlock
1334 * here as it might change during load (see uvm_obj_setlock()).
1335 * As changing state from VS_LOADING requires both vcache_lock
1336 * and v_interlock it is safe to test with vcache_lock held.
1337 *
1338 * Wait for vnodes changing state from VS_LOADING and retry.
1339 */
1340 if (__predict_false(vip->vi_state == VS_LOADING)) {
1341 cv_wait(&vcache_cv, &vcache_lock);
1342 mutex_exit(&vcache_lock);
1343 goto again;
1344 }
1345 vp = VIMPL_TO_VNODE(vip);
1346 mutex_enter(vp->v_interlock);
1347 mutex_exit(&vcache_lock);
1348 error = vcache_vget(vp);
1349 if (error == ENOENT)
1350 goto again;
1351 if (error == 0)
1352 *vpp = vp;
1353 KASSERT((error != 0) == (*vpp == NULL));
1354 return error;
1355 }
1356 mutex_exit(&vcache_lock);
1357
1358 /* Allocate and initialize a new vcache / vnode pair. */
1359 error = vfs_busy(mp);
1360 if (error)
1361 return error;
1362 new_vip = vcache_alloc();
1363 new_vip->vi_key = vcache_key;
1364 vp = VIMPL_TO_VNODE(new_vip);
1365 mutex_enter(&vcache_lock);
1366 vip = vcache_hash_lookup(&vcache_key, hash);
1367 if (vip == NULL) {
1368 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1369 new_vip, vi_hash);
1370 vip = new_vip;
1371 }
1372
1373 /* If another thread beat us inserting this node, retry. */
1374 if (vip != new_vip) {
1375 vcache_dealloc(new_vip);
1376 vfs_unbusy(mp);
1377 goto again;
1378 }
1379 mutex_exit(&vcache_lock);
1380
	/* Load the fs node.  Exclusive as new_vip is VS_LOADING. */
1382 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1383 if (error) {
1384 mutex_enter(&vcache_lock);
1385 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1386 new_vip, vnode_impl, vi_hash);
1387 vcache_dealloc(new_vip);
1388 vfs_unbusy(mp);
1389 KASSERT(*vpp == NULL);
1390 return error;
1391 }
1392 KASSERT(new_key != NULL);
1393 KASSERT(memcmp(key, new_key, key_len) == 0);
1394 KASSERT(vp->v_op != NULL);
1395 vfs_insmntque(vp, mp);
1396 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1397 vp->v_vflag |= VV_MPSAFE;
1398 vfs_ref(mp);
1399 vfs_unbusy(mp);
1400
1401 /* Finished loading, finalize node. */
1402 mutex_enter(&vcache_lock);
1403 new_vip->vi_key.vk_key = new_key;
1404 mutex_enter(vp->v_interlock);
1405 VSTATE_CHANGE(vp, VS_LOADING, VS_ACTIVE);
1406 mutex_exit(vp->v_interlock);
1407 mutex_exit(&vcache_lock);
1408 *vpp = vp;
1409 return 0;
1410 }
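
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build: how
 * a file system would typically look up a vnode by key.  The key layout
 * (a bare inode number) and the "example_*" name are hypothetical; each
 * file system defines its own key format, which its VFS_LOADVNODE()
 * implementation interprets.
 */
#if 0
static int
example_vget_by_ino(struct mount *mp, ino_t ino, struct vnode **vpp)
{
	int error;

	/* On success *vpp is referenced but not locked. */
	error = vcache_get(mp, &ino, sizeof(ino), vpp);
	if (error != 0)
		return error;

	/* Take the vnode lock if the caller needs it. */
	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	return 0;
}
#endif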
1411
1412 /*
1413 * Create a new vnode / fs node pair and return it referenced through vpp.
1414 */
1415 int
1416 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1417 kauth_cred_t cred, struct vnode **vpp)
1418 {
1419 int error;
1420 uint32_t hash;
1421 struct vnode *vp, *ovp;
1422 vnode_impl_t *vip, *ovip;
1423
1424 *vpp = NULL;
1425
1426 /* Allocate and initialize a new vcache / vnode pair. */
1427 error = vfs_busy(mp);
1428 if (error)
1429 return error;
1430 vip = vcache_alloc();
1431 vip->vi_key.vk_mount = mp;
1432 vp = VIMPL_TO_VNODE(vip);
1433
1434 /* Create and load the fs node. */
1435 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
1436 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
1437 if (error) {
1438 mutex_enter(&vcache_lock);
1439 vcache_dealloc(vip);
1440 vfs_unbusy(mp);
1441 KASSERT(*vpp == NULL);
1442 return error;
1443 }
1444 KASSERT(vip->vi_key.vk_key != NULL);
1445 KASSERT(vp->v_op != NULL);
1446 hash = vcache_hash(&vip->vi_key);
1447
1448 /* Wait for previous instance to be reclaimed, then insert new node. */
1449 mutex_enter(&vcache_lock);
1450 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
1451 ovp = VIMPL_TO_VNODE(ovip);
1452 mutex_enter(ovp->v_interlock);
1453 mutex_exit(&vcache_lock);
1454 error = vcache_vget(ovp);
1455 KASSERT(error == ENOENT);
1456 mutex_enter(&vcache_lock);
1457 }
1458 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
1459 vip, vi_hash);
1460 mutex_exit(&vcache_lock);
1461 vfs_insmntque(vp, mp);
1462 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1463 vp->v_vflag |= VV_MPSAFE;
1464 vfs_ref(mp);
1465 vfs_unbusy(mp);
1466
1467 /* Finished loading, finalize node. */
1468 mutex_enter(&vcache_lock);
1469 mutex_enter(vp->v_interlock);
1470 VSTATE_CHANGE(vp, VS_LOADING, VS_ACTIVE);
1471 mutex_exit(&vcache_lock);
1472 mutex_exit(vp->v_interlock);
1473 *vpp = vp;
1474 return 0;
1475 }
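
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build: a
 * file system's create path would allocate the new vnode / fs node pair
 * through vcache_new(), which in turn calls the file system's
 * VFS_NEWVNODE() to create and load the fs node.  The "example_*" name
 * is hypothetical.
 */
#if 0
static int
example_create_new_node(struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, struct vnode **vpp)
{

	/* On success *vpp is referenced but not locked. */
	return vcache_new(dvp->v_mount, dvp, vap, cred, vpp);
}
#endif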
1476
1477 /*
 * Prepare key change: update old cache node's key and lock new cache node.
1479 * Return an error if the new node already exists.
1480 */
1481 int
1482 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1483 const void *old_key, size_t old_key_len,
1484 const void *new_key, size_t new_key_len)
1485 {
1486 uint32_t old_hash, new_hash;
1487 struct vcache_key old_vcache_key, new_vcache_key;
1488 vnode_impl_t *vip, *new_vip;
1489
1490 old_vcache_key.vk_mount = mp;
1491 old_vcache_key.vk_key = old_key;
1492 old_vcache_key.vk_key_len = old_key_len;
1493 old_hash = vcache_hash(&old_vcache_key);
1494
1495 new_vcache_key.vk_mount = mp;
1496 new_vcache_key.vk_key = new_key;
1497 new_vcache_key.vk_key_len = new_key_len;
1498 new_hash = vcache_hash(&new_vcache_key);
1499
1500 new_vip = vcache_alloc();
1501 new_vip->vi_key = new_vcache_key;
1502
1503 /* Insert locked new node used as placeholder. */
1504 mutex_enter(&vcache_lock);
1505 vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1506 if (vip != NULL) {
1507 vcache_dealloc(new_vip);
1508 return EEXIST;
1509 }
1510 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1511 new_vip, vi_hash);
1512
	/* Replace old node's key with the temporary copy. */
1514 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1515 KASSERT(vip != NULL);
1516 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1517 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
1518 vip->vi_key = old_vcache_key;
1519 mutex_exit(&vcache_lock);
1520 return 0;
1521 }
1522
1523 /*
1524 * Key change complete: update old node and remove placeholder.
1525 */
1526 void
1527 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1528 const void *old_key, size_t old_key_len,
1529 const void *new_key, size_t new_key_len)
1530 {
1531 uint32_t old_hash, new_hash;
1532 struct vcache_key old_vcache_key, new_vcache_key;
1533 vnode_impl_t *vip, *new_vip;
1534 struct vnode *new_vp;
1535
1536 old_vcache_key.vk_mount = mp;
1537 old_vcache_key.vk_key = old_key;
1538 old_vcache_key.vk_key_len = old_key_len;
1539 old_hash = vcache_hash(&old_vcache_key);
1540
1541 new_vcache_key.vk_mount = mp;
1542 new_vcache_key.vk_key = new_key;
1543 new_vcache_key.vk_key_len = new_key_len;
1544 new_hash = vcache_hash(&new_vcache_key);
1545
1546 mutex_enter(&vcache_lock);
1547
1548 /* Lookup old and new node. */
1549 vip = vcache_hash_lookup(&old_vcache_key, old_hash);
1550 KASSERT(vip != NULL);
1551 KASSERT(VIMPL_TO_VNODE(vip) == vp);
1552
1553 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
1554 KASSERT(new_vip != NULL);
1555 KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
1556 new_vp = VIMPL_TO_VNODE(new_vip);
1557 mutex_enter(new_vp->v_interlock);
1558 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
1559 mutex_exit(new_vp->v_interlock);
1560
1561 /* Rekey old node and put it onto its new hashlist. */
1562 vip->vi_key = new_vcache_key;
1563 if (old_hash != new_hash) {
1564 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
1565 vip, vnode_impl, vi_hash);
1566 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
1567 vip, vi_hash);
1568 }
1569
1570 /* Remove new node used as placeholder. */
1571 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
1572 new_vip, vnode_impl, vi_hash);
1573 vcache_dealloc(new_vip);
1574 }
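
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build: the
 * two-phase rekey protocol a file system would use when an operation
 * such as rename changes the key of a cached vnode.  The inode-number
 * keys and the "example_*" name are hypothetical.
 */
#if 0
static int
example_change_key(struct mount *mp, struct vnode *vp,
    ino_t old_ino, ino_t new_ino)
{
	int error;

	/* Reserve the new key with a locked placeholder node. */
	error = vcache_rekey_enter(mp, vp, &old_ino, sizeof(old_ino),
	    &new_ino, sizeof(new_ino));
	if (error != 0)
		return error;	/* EEXIST: new key already present */

	/* ... update the fs node so it is addressed by new_ino ... */

	/* Publish the new key and remove the placeholder. */
	vcache_rekey_exit(mp, vp, &old_ino, sizeof(old_ino),
	    &new_ino, sizeof(new_ino));
	return 0;
}
#endif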
1575
/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with vnode locked and will return unlocked.
 * Must be called with the interlock held, and will return with it held.
 */
static void
vcache_reclaim(vnode_t *vp)
{
	lwp_t *l = curlwp;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
	struct mount *mp = vp->v_mount;
	uint32_t hash;
	uint8_t temp_buf[64], *temp_key;
	size_t temp_key_len;
	bool recycle, active;
	int error;

	KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	active = (vp->v_usecount > 1);
	temp_key_len = vip->vi_key.vk_key_len;
1596 /*
1597 * Prevent the vnode from being recycled or brought into use
1598 * while we clean it out.
1599 */
1600 VSTATE_CHANGE(vp, VS_ACTIVE, VS_RECLAIMING);
1601 if (vp->v_iflag & VI_EXECMAP) {
1602 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1603 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1604 }
1605 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1606 mutex_exit(vp->v_interlock);
1607
1608 /* Replace the vnode key with a temporary copy. */
1609 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
1610 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1611 } else {
1612 temp_key = temp_buf;
1613 }
1614 mutex_enter(&vcache_lock);
1615 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
1616 vip->vi_key.vk_key = temp_key;
1617 mutex_exit(&vcache_lock);
1618
1619 fstrans_start(mp, FSTRANS_SHARED);
1620
1621 /*
1622 * Clean out any cached data associated with the vnode.
1623 * If purging an active vnode, it must be closed and
1624 * deactivated before being reclaimed.
1625 */
1626 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1627 if (error != 0) {
1628 if (wapbl_vphaswapbl(vp))
1629 WAPBL_DISCARD(wapbl_vptomp(vp));
1630 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1631 }
1632 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1633 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1634 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1635 spec_node_revoke(vp);
1636 }
1637
1638 /*
1639 * Disassociate the underlying file system from the vnode.
1640 * Note that the VOP_INACTIVE will not unlock the vnode.
1641 */
1642 VOP_INACTIVE(vp, &recycle);
1643 VOP_UNLOCK(vp);
1644 if (VOP_RECLAIM(vp)) {
1645 vnpanic(vp, "%s: cannot reclaim", __func__);
1646 }
1647
1648 KASSERT(vp->v_data == NULL);
1649 KASSERT(vp->v_uobj.uo_npages == 0);
1650
1651 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1652 uvm_ra_freectx(vp->v_ractx);
1653 vp->v_ractx = NULL;
1654 }
1655
1656 /* Purge name cache. */
1657 cache_purge(vp);
1658
1659 /* Remove from vnode cache. */
1660 hash = vcache_hash(&vip->vi_key);
1661 mutex_enter(&vcache_lock);
1662 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
1663 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
1664 vip, vnode_impl, vi_hash);
1665 mutex_exit(&vcache_lock);
1666 if (temp_key != temp_buf)
1667 kmem_free(temp_key, temp_key_len);
1668
1669 /* Done with purge, notify sleepers of the grim news. */
1670 mutex_enter(vp->v_interlock);
1671 vp->v_op = dead_vnodeop_p;
1672 vp->v_vflag |= VV_LOCKSWORK;
1673 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1674 vp->v_tag = VT_NON;
1675 KNOTE(&vp->v_klist, NOTE_REVOKE);
1676 mutex_exit(vp->v_interlock);
1677
1678 /*
1679 * Move to dead mount. Must be after changing the operations
1680 * vector as vnode operations enter the mount before using the
1681 * operations vector. See sys/kern/vnode_if.c.
1682 */
1683 vp->v_vflag &= ~VV_ROOT;
1684 vfs_ref(dead_rootmount);
1685 vfs_insmntque(vp, dead_rootmount);
1686
1687 mutex_enter(vp->v_interlock);
1688 fstrans_done(mp);
1689 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1690 }
1691
1692 /*
1693 * Update outstanding I/O count and do wakeup if requested.
1694 */
1695 void
1696 vwakeup(struct buf *bp)
1697 {
1698 vnode_t *vp;
1699
1700 if ((vp = bp->b_vp) == NULL)
1701 return;
1702
1703 KASSERT(bp->b_objlock == vp->v_interlock);
1704 KASSERT(mutex_owned(bp->b_objlock));
1705
1706 if (--vp->v_numoutput < 0)
1707 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1708 if (vp->v_numoutput == 0)
1709 cv_broadcast(&vp->v_cv);
1710 }
1711
1712 /*
1713 * Test a vnode for being or becoming dead. Returns one of:
1714 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1715 * ENOENT: vnode is dead.
1716 * 0: otherwise.
1717 *
1718 * Whenever this function returns a non-zero value all future
1719 * calls will also return a non-zero value.
1720 */
1721 int
1722 vdead_check(struct vnode *vp, int flags)
1723 {
1724
1725 KASSERT(mutex_owned(vp->v_interlock));
1726
1727 if (! ISSET(flags, VDEAD_NOWAIT))
1728 VSTATE_WAIT_STABLE(vp);
1729
1730 if (VSTATE_GET(vp) == VS_RECLAIMING) {
1731 KASSERT(ISSET(flags, VDEAD_NOWAIT));
1732 return EBUSY;
1733 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
1734 return ENOENT;
1735 }
1736
1737 return 0;
1738 }
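
/*
 * Illustrative sketch, wrapped in #if 0 and not part of the build:
 * checking that a vnode is still alive before starting an operation
 * that must not run on a revoked vnode.  The "example_*" name is
 * hypothetical.
 */
#if 0
static int
example_check_alive(vnode_t *vp)
{
	int error;

	mutex_enter(vp->v_interlock);
	/* Without VDEAD_NOWAIT this waits for a stable state first. */
	error = vdead_check(vp, 0);
	mutex_exit(vp->v_interlock);

	return error;	/* 0: alive, ENOENT: dead */
}
#endif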
1739
1740 int
1741 vfs_drainvnodes(void)
1742 {
1743 int i, gen;
1744
1745 mutex_enter(&vdrain_lock);
1746 for (i = 0; i < 2; i++) {
1747 gen = vdrain_gen;
1748 while (gen == vdrain_gen) {
1749 cv_broadcast(&vdrain_cv);
1750 cv_wait(&vdrain_gen_cv, &vdrain_lock);
1751 }
1752 }
1753 mutex_exit(&vdrain_lock);
1754
1755 if (numvnodes >= desiredvnodes)
1756 return EBUSY;
1757
1758 if (vcache_hashsize != desiredvnodes)
1759 vcache_reinit();
1760
1761 return 0;
1762 }
1763
1764 void
1765 vnpanic(vnode_t *vp, const char *fmt, ...)
1766 {
1767 va_list ap;
1768
1769 #ifdef DIAGNOSTIC
1770 vprint(NULL, vp);
1771 #endif
1772 va_start(ap, fmt);
1773 vpanic(fmt, ap);
1774 va_end(ap);
1775 }
1776