/*	$NetBSD: vfs_vnode.c,v 1.51 2016/05/26 11:08:44 hannken Exp $	*/
2
3 /*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via vcache_get(9) or vcache_new(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is now inactive.  Via this call, the
 *	file system indicates whether the vnode can be recycled (usually, it
 *	checks its own references, e.g. the link count, or whether the file
 *	was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (the cache), or cleaned via vclean(9), which calls
 *	VOP_RECLAIM(9) to disassociate the underlying file system from the
 *	vnode, and is finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained by the
 *	vref(9), vrele(9) and vput(9) routines.  Typical holders of
 *	references are, for example, open files, the current working
 *	directory and mount points.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without the interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
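
/*
 * Sketch of the common reference counting patterns described above;
 * vp is assumed to be a vnode the caller already holds referenced:
 *
 *	vref(vp);			// take an additional reference
 *	...use the vnode...
 *	vrele(vp);			// drop that reference again
 *
 *	if (vn_lock(vp, LK_EXCLUSIVE) == 0) {
 *		...operate on the locked vnode...
 *		vput(vp);		// unlock and drop one reference
 *	} else
 *		vrele(vp);		// drop the reference only
 */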
117
118 #include <sys/cdefs.h>
119 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.51 2016/05/26 11:08:44 hannken Exp $");
120
121 #define _VFS_VNODE_PRIVATE
122
123 #include <sys/param.h>
124 #include <sys/kernel.h>
125
126 #include <sys/atomic.h>
127 #include <sys/buf.h>
128 #include <sys/conf.h>
129 #include <sys/device.h>
130 #include <sys/hash.h>
131 #include <sys/kauth.h>
132 #include <sys/kmem.h>
133 #include <sys/kthread.h>
134 #include <sys/module.h>
135 #include <sys/mount.h>
136 #include <sys/namei.h>
137 #include <sys/syscallargs.h>
138 #include <sys/sysctl.h>
139 #include <sys/systm.h>
140 #include <sys/vnode.h>
141 #include <sys/wapbl.h>
142 #include <sys/fstrans.h>
143
144 #include <uvm/uvm.h>
145 #include <uvm/uvm_readahead.h>
146
147 /* Flags to vrelel. */
148 #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
149 #define VRELEL_CHANGING_SET 0x0002 /* VI_CHANGING set by caller. */
150
151 enum vcache_state {
152 VN_MARKER, /* Stable, used as marker. Will not change. */
153 VN_LOADING, /* Intermediate, initialising the fs node. */
154 VN_ACTIVE, /* Stable, valid fs node attached. */
155 VN_BLOCKED, /* Intermediate, active, no new references allowed. */
156 VN_RECLAIMING, /* Intermediate, detaching the fs node. */
157 VN_RECLAIMED /* Stable, no fs node attached. */
158 };
159 struct vcache_key {
160 struct mount *vk_mount;
161 const void *vk_key;
162 size_t vk_key_len;
163 };
164 struct vcache_node {
165 struct vnode vn_data;
166 enum vcache_state vn_state;
167 SLIST_ENTRY(vcache_node) vn_hash;
168 struct vnode *vn_vnode;
169 struct vcache_key vn_key;
170 };
171
172 #define VN_TO_VP(node) ((vnode_t *)(node))
173 #define VP_TO_VN(vp) ((struct vcache_node *)(vp))
174
175 u_int numvnodes __cacheline_aligned;
176
/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism looks at the former list first.
 */
182 static kmutex_t vnode_free_list_lock __cacheline_aligned;
183 static vnodelst_t vnode_free_list __cacheline_aligned;
184 static vnodelst_t vnode_hold_list __cacheline_aligned;
185 static kcondvar_t vdrain_cv __cacheline_aligned;
186
187 static vnodelst_t vrele_list __cacheline_aligned;
188 static kmutex_t vrele_lock __cacheline_aligned;
189 static kcondvar_t vrele_cv __cacheline_aligned;
190 static lwp_t * vrele_lwp __cacheline_aligned;
191 static int vrele_pending __cacheline_aligned;
192 static int vrele_gen __cacheline_aligned;
193
194 SLIST_HEAD(hashhead, vcache_node);
195 static struct {
196 kmutex_t lock;
197 kcondvar_t cv;
198 u_long hashmask;
199 struct hashhead *hashtab;
200 pool_cache_t pool;
201 } vcache __cacheline_aligned;
202
203 static int cleanvnode(void);
204 static struct vcache_node *vcache_alloc(void);
205 static void vcache_free(struct vcache_node *);
206 static void vcache_init(void);
207 static void vcache_reinit(void);
208 static void vclean(vnode_t *);
209 static void vrelel(vnode_t *, int);
210 static void vdrain_thread(void *);
211 static void vrele_thread(void *);
212 static void vnpanic(vnode_t *, const char *, ...)
213 __printflike(2, 3);
214 static void vwait(vnode_t *, int);
215
216 /* Routines having to do with the management of the vnode table. */
217 extern struct mount *dead_rootmount;
218 extern int (**dead_vnodeop_p)(void *);
219 extern struct vfsops dead_vfsops;
220
221 /* Vnode state operations and diagnostics. */
222
223 static const char *
224 vstate_name(enum vcache_state state)
225 {
226
227 switch (state) {
228 case VN_MARKER:
229 return "MARKER";
230 case VN_LOADING:
231 return "LOADING";
232 case VN_ACTIVE:
233 return "ACTIVE";
234 case VN_BLOCKED:
235 return "BLOCKED";
236 case VN_RECLAIMING:
237 return "RECLAIMING";
238 case VN_RECLAIMED:
239 return "RECLAIMED";
240 default:
241 return "ILLEGAL";
242 }
243 }
244
245 #if defined(DIAGNOSTIC)
246
247 #define VSTATE_GET(vp) \
248 vstate_assert_get((vp), __func__, __LINE__)
249 #define VSTATE_CHANGE(vp, from, to) \
250 vstate_assert_change((vp), (from), (to), __func__, __LINE__)
251 #define VSTATE_WAIT_STABLE(vp) \
252 vstate_assert_wait_stable((vp), __func__, __LINE__)
253 #define VSTATE_ASSERT(vp, state) \
254 vstate_assert((vp), (state), __func__, __LINE__)
255
256 static void __unused
257 vstate_assert(vnode_t *vp, enum vcache_state state, const char *func, int line)
258 {
259 struct vcache_node *node = VP_TO_VN(vp);
260
261 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
262
263 if (__predict_true(node->vn_state == state))
264 return;
265 vnpanic(vp, "state is %s, expected %s at %s:%d",
266 vstate_name(node->vn_state), vstate_name(state), func, line);
267 }
268
269 static enum vcache_state __unused
270 vstate_assert_get(vnode_t *vp, const char *func, int line)
271 {
272 struct vcache_node *node = VP_TO_VN(vp);
273
274 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
275 if (node->vn_state == VN_MARKER)
276 vnpanic(vp, "state is %s at %s:%d",
277 vstate_name(node->vn_state), func, line);
278
279 return node->vn_state;
280 }
281
282 static void __unused
283 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
284 {
285 struct vcache_node *node = VP_TO_VN(vp);
286
287 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
288 if (node->vn_state == VN_MARKER)
289 vnpanic(vp, "state is %s at %s:%d",
290 vstate_name(node->vn_state), func, line);
291
292 while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED)
293 cv_wait(&vp->v_cv, vp->v_interlock);
294
295 if (node->vn_state == VN_MARKER)
296 vnpanic(vp, "state is %s at %s:%d",
297 vstate_name(node->vn_state), func, line);
298 }
299
300 static void __unused
301 vstate_assert_change(vnode_t *vp, enum vcache_state from, enum vcache_state to,
302 const char *func, int line)
303 {
304 struct vcache_node *node = VP_TO_VN(vp);
305
306 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
307 if (from == VN_LOADING)
308 KASSERTMSG(mutex_owned(&vcache.lock), "at %s:%d", func, line);
309
310 if (from == VN_MARKER)
311 vnpanic(vp, "from is %s at %s:%d",
312 vstate_name(from), func, line);
313 if (to == VN_MARKER)
314 vnpanic(vp, "to is %s at %s:%d",
315 vstate_name(to), func, line);
316 if (node->vn_state != from)
317 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
318 vstate_name(node->vn_state), vstate_name(from), func, line);
319
320 node->vn_state = to;
321 if (from == VN_LOADING)
322 cv_broadcast(&vcache.cv);
323 if (to == VN_ACTIVE || to == VN_RECLAIMED)
324 cv_broadcast(&vp->v_cv);
325 }
326
327 #else /* defined(DIAGNOSTIC) */
328
329 #define VSTATE_GET(vp) \
330 (VP_TO_VN((vp))->vn_state)
331 #define VSTATE_CHANGE(vp, from, to) \
332 vstate_change((vp), (from), (to))
333 #define VSTATE_WAIT_STABLE(vp) \
334 vstate_wait_stable((vp))
335 #define VSTATE_ASSERT(vp, state)
336
337 static void __unused
338 vstate_wait_stable(vnode_t *vp)
339 {
340 struct vcache_node *node = VP_TO_VN(vp);
341
342 while (node->vn_state != VN_ACTIVE && node->vn_state != VN_RECLAIMED)
343 cv_wait(&vp->v_cv, vp->v_interlock);
344 }
345
346 static void __unused
347 vstate_change(vnode_t *vp, enum vcache_state from, enum vcache_state to)
348 {
349 struct vcache_node *node = VP_TO_VN(vp);
350
351 node->vn_state = to;
352 if (from == VN_LOADING)
353 cv_broadcast(&vcache.cv);
354 if (to == VN_ACTIVE || to == VN_RECLAIMED)
355 cv_broadcast(&vp->v_cv);
356 }
357
358 #endif /* defined(DIAGNOSTIC) */
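
/*
 * Sketch of how the VSTATE_* helpers above are intended to be used: all
 * of them require v_interlock, and a change away from VN_LOADING
 * additionally requires vcache.lock (see vstate_assert_change()).
 *
 *	mutex_enter(vp->v_interlock);
 *	VSTATE_WAIT_STABLE(vp);		// wait out intermediate states
 *	if (VSTATE_GET(vp) == VN_RECLAIMED) {
 *		mutex_exit(vp->v_interlock);
 *		return ENOENT;		// vnode is dead
 *	}
 *	VSTATE_ASSERT(vp, VN_ACTIVE);
 *	mutex_exit(vp->v_interlock);
 */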
359
360 void
361 vfs_vnode_sysinit(void)
362 {
363 int error __diagused;
364
365 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
366 KASSERT(dead_rootmount != NULL);
367 dead_rootmount->mnt_iflag = IMNT_MPSAFE;
368
369 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
370 TAILQ_INIT(&vnode_free_list);
371 TAILQ_INIT(&vnode_hold_list);
372 TAILQ_INIT(&vrele_list);
373
374 vcache_init();
375
376 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
377 cv_init(&vdrain_cv, "vdrain");
378 cv_init(&vrele_cv, "vrele");
379 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
380 NULL, NULL, "vdrain");
381 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
382 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
383 NULL, &vrele_lwp, "vrele");
384 KASSERTMSG((error == 0), "kthread_create(vrele) failed: %d", error);
385 }
386
387 /*
388 * Allocate a new marker vnode.
389 */
390 vnode_t *
391 vnalloc_marker(struct mount *mp)
392 {
393 struct vcache_node *node;
394 vnode_t *vp;
395
396 node = pool_cache_get(vcache.pool, PR_WAITOK);
397 memset(node, 0, sizeof(*node));
398 vp = VN_TO_VP(node);
399 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
400 vp->v_mount = mp;
401 vp->v_type = VBAD;
402 vp->v_iflag = VI_MARKER;
403
404 return vp;
405 }
406
407 /*
408 * Free a marker vnode.
409 */
410 void
411 vnfree_marker(vnode_t *vp)
412 {
413 struct vcache_node *node;
414
415 node = VP_TO_VN(vp);
416 KASSERT(ISSET(vp->v_iflag, VI_MARKER));
417 uvm_obj_destroy(&vp->v_uobj, true);
418 pool_cache_put(vcache.pool, node);
419 }
420
421 /*
422 * Test a vnode for being a marker vnode.
423 */
424 bool
425 vnis_marker(vnode_t *vp)
426 {
427
428 return (ISSET(vp->v_iflag, VI_MARKER));
429 }
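
/*
 * Sketch of the intended use of marker vnodes: code that walks a
 * per-mount vnode list (mnt_vnodelist/v_mntvnodes, as used elsewhere in
 * the kernel) inserts a marker behind the current vnode before dropping
 * mntvnode_lock, and later resumes from the marker, skipping markers
 * inserted by other walkers:
 *
 *	vnode_t *mvp = vnalloc_marker(mp);
 *	...
 *	if (vnis_marker(vp))
 *		continue;		// some other walker's marker
 *	TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
 *	mutex_exit(&mntvnode_lock);
 *	...examine vp with the list unlocked...
 *	mutex_enter(&mntvnode_lock);
 *	vp = TAILQ_NEXT(mvp, v_mntvnodes);
 *	TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
 *	...
 *	vnfree_marker(mvp);
 */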
430
431 /*
432 * cleanvnode: grab a vnode from freelist, clean and free it.
433 *
434 * => Releases vnode_free_list_lock.
435 */
436 static int
437 cleanvnode(void)
438 {
439 vnode_t *vp;
440 vnodelst_t *listhd;
441 struct mount *mp;
442
443 KASSERT(mutex_owned(&vnode_free_list_lock));
444
445 listhd = &vnode_free_list;
446 try_nextlist:
447 TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here: vnodes with
		 * a non-zero use count or with VI_CLEAN set never
		 * appear on these free lists.
		 */
454 KASSERT(vp->v_usecount == 0);
455 KASSERT((vp->v_iflag & VI_CLEAN) == 0);
456 KASSERT(vp->v_freelisthd == listhd);
457
458 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
459 continue;
460 if (!mutex_tryenter(vp->v_interlock)) {
461 VOP_UNLOCK(vp);
462 continue;
463 }
464 KASSERT((vp->v_iflag & VI_XLOCK) == 0);
465 mp = vp->v_mount;
466 if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
467 mutex_exit(vp->v_interlock);
468 VOP_UNLOCK(vp);
469 continue;
470 }
471 break;
472 }
473
474 if (vp == NULL) {
475 if (listhd == &vnode_free_list) {
476 listhd = &vnode_hold_list;
477 goto try_nextlist;
478 }
479 mutex_exit(&vnode_free_list_lock);
480 return EBUSY;
481 }
482
483 /* Remove it from the freelist. */
484 TAILQ_REMOVE(listhd, vp, v_freelist);
485 vp->v_freelisthd = NULL;
486 mutex_exit(&vnode_free_list_lock);
487
488 KASSERT(vp->v_usecount == 0);
489
490 /*
491 * The vnode is still associated with a file system, so we must
492 * clean it out before freeing it. We need to add a reference
493 * before doing this.
494 */
495 vp->v_usecount = 1;
496 KASSERT((vp->v_iflag & VI_CHANGING) == 0);
497 vp->v_iflag |= VI_CHANGING;
498 vclean(vp);
499 vrelel(vp, VRELEL_CHANGING_SET);
500 fstrans_done(mp);
501
502 return 0;
503 }
504
505 /*
506 * Helper thread to keep the number of vnodes below desiredvnodes.
507 */
508 static void
509 vdrain_thread(void *cookie)
510 {
511 int error;
512
513 mutex_enter(&vnode_free_list_lock);
514
515 for (;;) {
516 cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
517 while (numvnodes > desiredvnodes) {
518 error = cleanvnode();
519 if (error)
520 kpause("vndsbusy", false, hz, NULL);
521 mutex_enter(&vnode_free_list_lock);
522 if (error)
523 break;
524 }
525 }
526 }
527
528 /*
529 * Remove a vnode from its freelist.
530 */
531 void
532 vremfree(vnode_t *vp)
533 {
534
535 KASSERT(mutex_owned(vp->v_interlock));
536 KASSERT(vp->v_usecount == 0);
537
538 /*
539 * Note that the reference count must not change until
540 * the vnode is removed.
541 */
542 mutex_enter(&vnode_free_list_lock);
543 if (vp->v_holdcnt > 0) {
544 KASSERT(vp->v_freelisthd == &vnode_hold_list);
545 } else {
546 KASSERT(vp->v_freelisthd == &vnode_free_list);
547 }
548 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
549 vp->v_freelisthd = NULL;
550 mutex_exit(&vnode_free_list_lock);
551 }
552
/*
 * vget: get a particular vnode from the free list and increment its
 * reference count.
 *
 * => Must be called with v_interlock held; the interlock is released
 *    before returning.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error is returned to indicate that
 * the vnode is no longer usable.
 */
564 int
565 vget(vnode_t *vp, int flags, bool waitok)
566 {
567 int error = 0;
568
569 KASSERT((vp->v_iflag & VI_MARKER) == 0);
570 KASSERT(mutex_owned(vp->v_interlock));
571 KASSERT((flags & ~LK_NOWAIT) == 0);
572 KASSERT(waitok == ((flags & LK_NOWAIT) == 0));
573
574 /*
575 * Before adding a reference, we must remove the vnode
576 * from its freelist.
577 */
578 if (vp->v_usecount == 0) {
579 vremfree(vp);
580 vp->v_usecount = 1;
581 } else {
582 atomic_inc_uint(&vp->v_usecount);
583 }
584
585 /*
586 * If the vnode is in the process of changing state we wait
587 * for the change to complete and take care not to return
588 * a clean vnode.
589 */
590 if ((vp->v_iflag & VI_CHANGING) != 0) {
591 if ((flags & LK_NOWAIT) != 0) {
592 vrelel(vp, 0);
593 return EBUSY;
594 }
595 vwait(vp, VI_CHANGING);
596 if ((vp->v_iflag & VI_CLEAN) != 0) {
597 vrelel(vp, 0);
598 return ENOENT;
599 }
600 }
601
602 /*
603 * Ok, we got it in good shape.
604 */
605 KASSERT((vp->v_iflag & VI_CLEAN) == 0);
606 mutex_exit(vp->v_interlock);
607 return error;
608 }
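
/*
 * Sketch of a typical vget() caller: the interlock is taken first, the
 * reference is acquired through vget() and the interlock is released by
 * vget() itself.  The vnode comes back referenced but not locked.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, 0, true);	// may sleep on VI_CHANGING
 *	if (error != 0)
 *		return error;		// ENOENT: vnode was reclaimed
 *	...vp is referenced; vn_lock() it if a locked vnode is needed...
 */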
609
610 /*
611 * vput: unlock and release the reference.
612 */
613 void
614 vput(vnode_t *vp)
615 {
616
617 KASSERT((vp->v_iflag & VI_MARKER) == 0);
618
619 VOP_UNLOCK(vp);
620 vrele(vp);
621 }
622
/*
 * Try to drop a reference on a vnode.  Abort if we would be releasing
 * the last reference.  Note: this _must_ succeed if not the last reference.
 */
627 static inline bool
628 vtryrele(vnode_t *vp)
629 {
630 u_int use, next;
631
632 for (use = vp->v_usecount;; use = next) {
633 if (use == 1) {
634 return false;
635 }
636 KASSERT(use > 1);
637 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
638 if (__predict_true(next == use)) {
639 return true;
640 }
641 }
642 }
643
/*
 * Vnode release.  If the reference count drops to zero, call the inactive
 * routine and either return the vnode to the freelist or free it to the pool.
 */
648 static void
649 vrelel(vnode_t *vp, int flags)
650 {
651 bool recycle, defer;
652 int error;
653
654 KASSERT(mutex_owned(vp->v_interlock));
655 KASSERT((vp->v_iflag & VI_MARKER) == 0);
656 KASSERT(vp->v_freelisthd == NULL);
657
658 if (__predict_false(vp->v_op == dead_vnodeop_p &&
659 (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
660 vnpanic(vp, "dead but not clean");
661 }
662
663 /*
664 * If not the last reference, just drop the reference count
665 * and unlock.
666 */
667 if (vtryrele(vp)) {
668 if ((flags & VRELEL_CHANGING_SET) != 0) {
669 KASSERT((vp->v_iflag & VI_CHANGING) != 0);
670 vp->v_iflag &= ~VI_CHANGING;
671 cv_broadcast(&vp->v_cv);
672 }
673 mutex_exit(vp->v_interlock);
674 return;
675 }
676 if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
677 vnpanic(vp, "%s: bad ref count", __func__);
678 }
679
680 KASSERT((vp->v_iflag & VI_XLOCK) == 0);
681
682 #ifdef DIAGNOSTIC
683 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
684 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
685 vprint("vrelel: missing VOP_CLOSE()", vp);
686 }
687 #endif
688
689 /*
690 * If not clean, deactivate the vnode, but preserve
691 * our reference across the call to VOP_INACTIVE().
692 */
693 if ((vp->v_iflag & VI_CLEAN) == 0) {
694 recycle = false;
695
696 /*
697 * XXX This ugly block can be largely eliminated if
698 * locking is pushed down into the file systems.
699 *
700 * Defer vnode release to vrele_thread if caller
701 * requests it explicitly or is the pagedaemon.
702 */
703 if ((curlwp == uvm.pagedaemon_lwp) ||
704 (flags & VRELEL_ASYNC_RELE) != 0) {
705 defer = true;
706 } else if (curlwp == vrele_lwp) {
707 /*
708 * We have to try harder.
709 */
710 mutex_exit(vp->v_interlock);
711 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
712 KASSERTMSG((error == 0), "vn_lock failed: %d", error);
713 mutex_enter(vp->v_interlock);
714 defer = false;
715 } else {
716 /* If we can't acquire the lock, then defer. */
717 mutex_exit(vp->v_interlock);
718 error = vn_lock(vp,
719 LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
720 defer = (error != 0);
721 mutex_enter(vp->v_interlock);
722 }
723
724 KASSERT(mutex_owned(vp->v_interlock));
725 KASSERT(! (curlwp == vrele_lwp && defer));
726
727 if (defer) {
728 /*
729 * Defer reclaim to the kthread; it's not safe to
730 * clean it here. We donate it our last reference.
731 */
732 if ((flags & VRELEL_CHANGING_SET) != 0) {
733 KASSERT((vp->v_iflag & VI_CHANGING) != 0);
734 vp->v_iflag &= ~VI_CHANGING;
735 cv_broadcast(&vp->v_cv);
736 }
737 mutex_enter(&vrele_lock);
738 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
739 if (++vrele_pending > (desiredvnodes >> 8))
740 cv_signal(&vrele_cv);
741 mutex_exit(&vrele_lock);
742 mutex_exit(vp->v_interlock);
743 return;
744 }
745
746 /*
747 * If the node got another reference while we
748 * released the interlock, don't try to inactivate it yet.
749 */
750 if (__predict_false(vtryrele(vp))) {
751 VOP_UNLOCK(vp);
752 if ((flags & VRELEL_CHANGING_SET) != 0) {
753 KASSERT((vp->v_iflag & VI_CHANGING) != 0);
754 vp->v_iflag &= ~VI_CHANGING;
755 cv_broadcast(&vp->v_cv);
756 }
757 mutex_exit(vp->v_interlock);
758 return;
759 }
760
761 if ((flags & VRELEL_CHANGING_SET) == 0) {
762 KASSERT((vp->v_iflag & VI_CHANGING) == 0);
763 vp->v_iflag |= VI_CHANGING;
764 }
765 mutex_exit(vp->v_interlock);
766
767 /*
768 * The vnode can gain another reference while being
769 * deactivated. If VOP_INACTIVE() indicates that
770 * the described file has been deleted, then recycle
771 * the vnode irrespective of additional references.
772 * Another thread may be waiting to re-use the on-disk
773 * inode.
774 *
775 * Note that VOP_INACTIVE() will drop the vnode lock.
776 */
777 VOP_INACTIVE(vp, &recycle);
778 if (recycle) {
779 /* vclean() below will drop the lock. */
780 if (vn_lock(vp, LK_EXCLUSIVE) != 0)
781 recycle = false;
782 }
783 mutex_enter(vp->v_interlock);
784 if (!recycle) {
785 if (vtryrele(vp)) {
786 KASSERT((vp->v_iflag & VI_CHANGING) != 0);
787 vp->v_iflag &= ~VI_CHANGING;
788 cv_broadcast(&vp->v_cv);
789 mutex_exit(vp->v_interlock);
790 return;
791 }
792 }
793
794 /* Take care of space accounting. */
795 if (vp->v_iflag & VI_EXECMAP) {
796 atomic_add_int(&uvmexp.execpages,
797 -vp->v_uobj.uo_npages);
798 atomic_add_int(&uvmexp.filepages,
799 vp->v_uobj.uo_npages);
800 }
801 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
802 vp->v_vflag &= ~VV_MAPPED;
803
804 /*
805 * Recycle the vnode if the file is now unused (unlinked),
806 * otherwise just free it.
807 */
808 if (recycle) {
809 vclean(vp);
810 }
811 KASSERT(vp->v_usecount > 0);
812 } else { /* vnode was already clean */
813 if ((flags & VRELEL_CHANGING_SET) == 0) {
814 KASSERT((vp->v_iflag & VI_CHANGING) == 0);
815 vp->v_iflag |= VI_CHANGING;
816 }
817 }
818
819 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
820 /* Gained another reference while being reclaimed. */
821 KASSERT((vp->v_iflag & VI_CHANGING) != 0);
822 vp->v_iflag &= ~VI_CHANGING;
823 cv_broadcast(&vp->v_cv);
824 mutex_exit(vp->v_interlock);
825 return;
826 }
827
828 if ((vp->v_iflag & VI_CLEAN) != 0) {
829 /*
830 * It's clean so destroy it. It isn't referenced
831 * anywhere since it has been reclaimed.
832 */
833 KASSERT(vp->v_holdcnt == 0);
834 KASSERT(vp->v_writecount == 0);
835 mutex_exit(vp->v_interlock);
836 vfs_insmntque(vp, NULL);
837 if (vp->v_type == VBLK || vp->v_type == VCHR) {
838 spec_node_destroy(vp);
839 }
840 vcache_free(VP_TO_VN(vp));
841 } else {
842 /*
843 * Otherwise, put it back onto the freelist. It
844 * can't be destroyed while still associated with
845 * a file system.
846 */
847 mutex_enter(&vnode_free_list_lock);
848 if (vp->v_holdcnt > 0) {
849 vp->v_freelisthd = &vnode_hold_list;
850 } else {
851 vp->v_freelisthd = &vnode_free_list;
852 }
853 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
854 mutex_exit(&vnode_free_list_lock);
855 KASSERT((vp->v_iflag & VI_CHANGING) != 0);
856 vp->v_iflag &= ~VI_CHANGING;
857 cv_broadcast(&vp->v_cv);
858 mutex_exit(vp->v_interlock);
859 }
860 }
861
862 void
863 vrele(vnode_t *vp)
864 {
865
866 KASSERT((vp->v_iflag & VI_MARKER) == 0);
867
868 if (vtryrele(vp)) {
869 return;
870 }
871 mutex_enter(vp->v_interlock);
872 vrelel(vp, 0);
873 }
874
/*
 * Asynchronous vnode release: the vnode is released later, in a different
 * context, by the vrele kernel thread.
 */
878 void
879 vrele_async(vnode_t *vp)
880 {
881
882 KASSERT((vp->v_iflag & VI_MARKER) == 0);
883
884 if (vtryrele(vp)) {
885 return;
886 }
887 mutex_enter(vp->v_interlock);
888 vrelel(vp, VRELEL_ASYNC_RELE);
889 }
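
/*
 * Sketch: vrele_async() is for callers that cannot afford to sleep on
 * the vnode lock at this point (for example because they already hold
 * other vnode locks); the vnode is queued and the release, including
 * any VOP_INACTIVE() work, is performed later by the vrele thread.
 *
 *	vrele_async(vp);		// never takes the vnode lock here
 */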
890
891 static void
892 vrele_thread(void *cookie)
893 {
894 vnodelst_t skip_list;
895 vnode_t *vp;
896 struct mount *mp;
897
898 TAILQ_INIT(&skip_list);
899
900 mutex_enter(&vrele_lock);
901 for (;;) {
902 while (TAILQ_EMPTY(&vrele_list)) {
903 vrele_gen++;
904 cv_broadcast(&vrele_cv);
905 cv_timedwait(&vrele_cv, &vrele_lock, hz);
906 TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
907 }
908 vp = TAILQ_FIRST(&vrele_list);
909 mp = vp->v_mount;
910 TAILQ_REMOVE(&vrele_list, vp, v_freelist);
911 if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
912 TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
913 continue;
914 }
915 vrele_pending--;
916 mutex_exit(&vrele_lock);
917
918 /*
919 * If not the last reference, then ignore the vnode
920 * and look for more work.
921 */
922 mutex_enter(vp->v_interlock);
923 vrelel(vp, 0);
924 fstrans_done(mp);
925 mutex_enter(&vrele_lock);
926 }
927 }
928
929 void
930 vrele_flush(void)
931 {
932 int gen;
933
934 mutex_enter(&vrele_lock);
935 gen = vrele_gen;
936 while (vrele_pending && gen == vrele_gen) {
937 cv_broadcast(&vrele_cv);
938 cv_wait(&vrele_cv, &vrele_lock);
939 }
940 mutex_exit(&vrele_lock);
941 }
942
943 /*
944 * Vnode reference, where a reference is already held by some other
945 * object (for example, a file structure).
946 */
947 void
948 vref(vnode_t *vp)
949 {
950
951 KASSERT((vp->v_iflag & VI_MARKER) == 0);
952 KASSERT(vp->v_usecount != 0);
953
954 atomic_inc_uint(&vp->v_usecount);
955 }
956
957 /*
958 * Page or buffer structure gets a reference.
959 * Called with v_interlock held.
960 */
961 void
962 vholdl(vnode_t *vp)
963 {
964
965 KASSERT(mutex_owned(vp->v_interlock));
966 KASSERT((vp->v_iflag & VI_MARKER) == 0);
967
968 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
969 mutex_enter(&vnode_free_list_lock);
970 KASSERT(vp->v_freelisthd == &vnode_free_list);
971 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
972 vp->v_freelisthd = &vnode_hold_list;
973 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
974 mutex_exit(&vnode_free_list_lock);
975 }
976 }
977
978 /*
979 * Page or buffer structure frees a reference.
980 * Called with v_interlock held.
981 */
982 void
983 holdrelel(vnode_t *vp)
984 {
985
986 KASSERT(mutex_owned(vp->v_interlock));
987 KASSERT((vp->v_iflag & VI_MARKER) == 0);
988
989 if (vp->v_holdcnt <= 0) {
990 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
991 }
992
993 vp->v_holdcnt--;
994 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
995 mutex_enter(&vnode_free_list_lock);
996 KASSERT(vp->v_freelisthd == &vnode_hold_list);
997 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
998 vp->v_freelisthd = &vnode_free_list;
999 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
1000 mutex_exit(&vnode_free_list_lock);
1001 }
1002 }
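
/*
 * Sketch of the hold count protocol: code that associates a buffer or
 * page with the vnode takes a hold, so that, once unreferenced, the
 * vnode sits on vnode_hold_list and is only recycled after the plain
 * free list has been tried (see cleanvnode()); the hold is dropped when
 * the association goes away.
 *
 *	mutex_enter(vp->v_interlock);
 *	vholdl(vp);			// e.g. buffer attached to vp
 *	mutex_exit(vp->v_interlock);
 *	...
 *	mutex_enter(vp->v_interlock);
 *	holdrelel(vp);			// buffer disassociated again
 *	mutex_exit(vp->v_interlock);
 */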
1003
/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the vnode locked; the vnode lock is released.
 * Must be called with the interlock held; it returns with the interlock held.
 */
1010 static void
1011 vclean(vnode_t *vp)
1012 {
1013 lwp_t *l = curlwp;
1014 bool recycle, active;
1015 int error;
1016
1017 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1018 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1019 KASSERT(mutex_owned(vp->v_interlock));
1020 KASSERT((vp->v_iflag & VI_MARKER) == 0);
1021 KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
1022 KASSERT(vp->v_usecount != 0);
1023
1024 active = (vp->v_usecount > 1);
1025 /*
1026 * Prevent the vnode from being recycled or brought into use
1027 * while we clean it out.
1028 */
1029 vp->v_iflag |= VI_XLOCK;
1030 if (vp->v_iflag & VI_EXECMAP) {
1031 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1032 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1033 }
1034 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1035 mutex_exit(vp->v_interlock);
1036
1037 /*
1038 * Clean out any cached data associated with the vnode.
1039 * If purging an active vnode, it must be closed and
1040 * deactivated before being reclaimed. Note that the
1041 * VOP_INACTIVE will unlock the vnode.
1042 */
1043 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1044 if (error != 0) {
1045 if (wapbl_vphaswapbl(vp))
1046 WAPBL_DISCARD(wapbl_vptomp(vp));
1047 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1048 }
1049 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1050 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1051 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1052 spec_node_revoke(vp);
1053 }
1054 if (active) {
1055 VOP_INACTIVE(vp, &recycle);
1056 } else {
1057 /*
1058 * Any other processes trying to obtain this lock must first
1059 * wait for VI_XLOCK to clear, then call the new lock operation.
1060 */
1061 VOP_UNLOCK(vp);
1062 }
1063
1064 /* Disassociate the underlying file system from the vnode. */
1065 if (VOP_RECLAIM(vp)) {
1066 vnpanic(vp, "%s: cannot reclaim", __func__);
1067 }
1068
1069 KASSERT(vp->v_data == NULL);
1070 KASSERT(vp->v_uobj.uo_npages == 0);
1071
1072 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1073 uvm_ra_freectx(vp->v_ractx);
1074 vp->v_ractx = NULL;
1075 }
1076
1077 /* Purge name cache. */
1078 cache_purge(vp);
1079
1080 /* Move to dead mount. */
1081 vp->v_vflag &= ~VV_ROOT;
1082 atomic_inc_uint(&dead_rootmount->mnt_refcnt);
1083 vfs_insmntque(vp, dead_rootmount);
1084
1085 /* Done with purge, notify sleepers of the grim news. */
1086 mutex_enter(vp->v_interlock);
1087 vp->v_op = dead_vnodeop_p;
1088 vp->v_vflag |= VV_LOCKSWORK;
1089 vp->v_iflag |= VI_CLEAN;
1090 vp->v_tag = VT_NON;
1091 KNOTE(&vp->v_klist, NOTE_REVOKE);
1092 vp->v_iflag &= ~VI_XLOCK;
1093 cv_broadcast(&vp->v_cv);
1094
1095 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1096 }
1097
/*
 * Recycle an unused vnode if the caller holds the last reference.
 */
1101 bool
1102 vrecycle(vnode_t *vp)
1103 {
1104
1105 if (vn_lock(vp, LK_EXCLUSIVE) != 0)
1106 return false;
1107
1108 mutex_enter(vp->v_interlock);
1109
1110 KASSERT((vp->v_iflag & VI_MARKER) == 0);
1111
1112 if (vp->v_usecount != 1) {
1113 mutex_exit(vp->v_interlock);
1114 VOP_UNLOCK(vp);
1115 return false;
1116 }
1117 if ((vp->v_iflag & VI_CHANGING) != 0)
1118 vwait(vp, VI_CHANGING);
1119 if (vp->v_usecount != 1) {
1120 mutex_exit(vp->v_interlock);
1121 VOP_UNLOCK(vp);
1122 return false;
1123 }
1124 KASSERT((vp->v_iflag & VI_CLEAN) == 0);
1125 vp->v_iflag |= VI_CHANGING;
1126 vclean(vp);
1127 vrelel(vp, VRELEL_CHANGING_SET);
1128 return true;
1129 }
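
/*
 * Sketch of a vrecycle() caller: a file system holding what it believes
 * to be the last reference to an unused vnode may try to get rid of it;
 * on success the reference is consumed, otherwise the caller still owns
 * it.
 *
 *	// vp: referenced, unlocked, believed to be the last reference
 *	if (!vrecycle(vp))
 *		vrele(vp);
 */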
1130
1131 /*
1132 * Eliminate all activity associated with the requested vnode
1133 * and with all vnodes aliased to the requested vnode.
1134 */
1135 void
1136 vrevoke(vnode_t *vp)
1137 {
1138 vnode_t *vq;
1139 enum vtype type;
1140 dev_t dev;
1141
1142 KASSERT(vp->v_usecount > 0);
1143
1144 mutex_enter(vp->v_interlock);
1145 if ((vp->v_iflag & VI_CLEAN) != 0) {
1146 mutex_exit(vp->v_interlock);
1147 return;
1148 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
1149 atomic_inc_uint(&vp->v_usecount);
1150 mutex_exit(vp->v_interlock);
1151 vgone(vp);
1152 return;
1153 } else {
1154 dev = vp->v_rdev;
1155 type = vp->v_type;
1156 mutex_exit(vp->v_interlock);
1157 }
1158
1159 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
1160 vgone(vq);
1161 }
1162 }
1163
1164 /*
1165 * Eliminate all activity associated with a vnode in preparation for
1166 * reuse. Drops a reference from the vnode.
1167 */
1168 void
1169 vgone(vnode_t *vp)
1170 {
1171
	if (vn_lock(vp, LK_EXCLUSIVE) != 0) {
		KASSERT((vp->v_iflag & VI_CLEAN) != 0);
		vrele(vp);
		return;
	}
1176
1177 mutex_enter(vp->v_interlock);
1178 if ((vp->v_iflag & VI_CHANGING) != 0)
1179 vwait(vp, VI_CHANGING);
1180 vp->v_iflag |= VI_CHANGING;
1181 vclean(vp);
1182 vrelel(vp, VRELEL_CHANGING_SET);
1183 }
1184
1185 static inline uint32_t
1186 vcache_hash(const struct vcache_key *key)
1187 {
1188 uint32_t hash = HASH32_BUF_INIT;
1189
1190 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1191 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1192 return hash;
1193 }
1194
1195 static void
1196 vcache_init(void)
1197 {
1198
1199 vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
1200 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
1201 KASSERT(vcache.pool != NULL);
1202 mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
1203 cv_init(&vcache.cv, "vcache");
1204 vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
1205 &vcache.hashmask);
1206 }
1207
1208 static void
1209 vcache_reinit(void)
1210 {
1211 int i;
1212 uint32_t hash;
1213 u_long oldmask, newmask;
1214 struct hashhead *oldtab, *newtab;
1215 struct vcache_node *node;
1216
1217 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1218 mutex_enter(&vcache.lock);
1219 oldtab = vcache.hashtab;
1220 oldmask = vcache.hashmask;
1221 vcache.hashtab = newtab;
1222 vcache.hashmask = newmask;
1223 for (i = 0; i <= oldmask; i++) {
1224 while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
1225 SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
1226 hash = vcache_hash(&node->vn_key);
1227 SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
1228 node, vn_hash);
1229 }
1230 }
1231 mutex_exit(&vcache.lock);
1232 hashdone(oldtab, HASH_SLIST, oldmask);
1233 }
1234
1235 static inline struct vcache_node *
1236 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1237 {
1238 struct hashhead *hashp;
1239 struct vcache_node *node;
1240
1241 KASSERT(mutex_owned(&vcache.lock));
1242
1243 hashp = &vcache.hashtab[hash & vcache.hashmask];
1244 SLIST_FOREACH(node, hashp, vn_hash) {
1245 if (key->vk_mount != node->vn_key.vk_mount)
1246 continue;
1247 if (key->vk_key_len != node->vn_key.vk_key_len)
1248 continue;
1249 if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
1250 continue;
1251 return node;
1252 }
1253 return NULL;
1254 }
1255
1256 /*
1257 * Allocate a new, uninitialized vcache node.
1258 */
1259 static struct vcache_node *
1260 vcache_alloc(void)
1261 {
1262 struct vcache_node *node;
1263 vnode_t *vp;
1264
1265 node = pool_cache_get(vcache.pool, PR_WAITOK);
1266 memset(node, 0, sizeof(*node));
1267
1268 /* SLIST_INIT(&node->vn_hash); */
1269
1270 vp = VN_TO_VP(node);
1271 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
1272 cv_init(&vp->v_cv, "vnode");
1273 /* LIST_INIT(&vp->v_nclist); */
1274 /* LIST_INIT(&vp->v_dnclist); */
1275
1276 mutex_enter(&vnode_free_list_lock);
1277 numvnodes++;
1278 if (numvnodes > desiredvnodes + desiredvnodes / 10)
1279 cv_signal(&vdrain_cv);
1280 mutex_exit(&vnode_free_list_lock);
1281
1282 rw_init(&vp->v_lock);
1283 vp->v_usecount = 1;
1284 vp->v_type = VNON;
1285 vp->v_size = vp->v_writesize = VSIZENOTSET;
1286
1287 node->vn_state = VN_LOADING;
1288
1289 return node;
1290 }
1291
1292 /*
1293 * Free an unused, unreferenced vcache node.
1294 */
1295 static void
1296 vcache_free(struct vcache_node *node)
1297 {
1298 vnode_t *vp;
1299
1300 vp = VN_TO_VP(node);
1301
1302 KASSERT(vp->v_usecount == 0);
1303 KASSERT((vp->v_iflag & VI_MARKER) == 0);
1304
1305 rw_destroy(&vp->v_lock);
1306 mutex_enter(&vnode_free_list_lock);
1307 numvnodes--;
1308 mutex_exit(&vnode_free_list_lock);
1309
1310 uvm_obj_destroy(&vp->v_uobj, true);
1311 cv_destroy(&vp->v_cv);
1312 pool_cache_put(vcache.pool, node);
1313 }
1314
1315 /*
1316 * Get a vnode / fs node pair by key and return it referenced through vpp.
1317 */
1318 int
1319 vcache_get(struct mount *mp, const void *key, size_t key_len,
1320 struct vnode **vpp)
1321 {
1322 int error;
1323 uint32_t hash;
1324 const void *new_key;
1325 struct vnode *vp;
1326 struct vcache_key vcache_key;
1327 struct vcache_node *node, *new_node;
1328
1329 new_key = NULL;
1330 *vpp = NULL;
1331
1332 vcache_key.vk_mount = mp;
1333 vcache_key.vk_key = key;
1334 vcache_key.vk_key_len = key_len;
1335 hash = vcache_hash(&vcache_key);
1336
1337 again:
1338 mutex_enter(&vcache.lock);
1339 node = vcache_hash_lookup(&vcache_key, hash);
1340
1341 /* If found, take a reference or retry. */
1342 if (__predict_true(node != NULL && node->vn_vnode != NULL)) {
1343 vp = node->vn_vnode;
1344 mutex_enter(vp->v_interlock);
1345 mutex_exit(&vcache.lock);
1346 error = vget(vp, 0, true /* wait */);
1347 if (error == ENOENT)
1348 goto again;
1349 if (error == 0)
1350 *vpp = vp;
1351 KASSERT((error != 0) == (*vpp == NULL));
1352 return error;
1353 }
1354
1355 /* If another thread loads this node, wait and retry. */
1356 if (node != NULL) {
1357 KASSERT(node->vn_vnode == NULL);
1358 mutex_exit(&vcache.lock);
1359 kpause("vcache", false, mstohz(20), NULL);
1360 goto again;
1361 }
1362 mutex_exit(&vcache.lock);
1363
1364 /* Allocate and initialize a new vcache / vnode pair. */
1365 error = vfs_busy(mp, NULL);
1366 if (error)
1367 return error;
1368 new_node = vcache_alloc();
1369 new_node->vn_key = vcache_key;
1370 vp = VN_TO_VP(new_node);
1371 mutex_enter(&vcache.lock);
1372 node = vcache_hash_lookup(&vcache_key, hash);
1373 if (node == NULL) {
1374 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1375 new_node, vn_hash);
1376 node = new_node;
1377 }
1378 mutex_exit(&vcache.lock);
1379
1380 /* If another thread beat us inserting this node, retry. */
1381 if (node != new_node) {
1382 KASSERT(vp->v_usecount == 1);
1383 vp->v_usecount = 0;
1384 vcache_free(new_node);
1385 vfs_unbusy(mp, false, NULL);
1386 goto again;
1387 }
1388
1389 /* Load the fs node. Exclusive as new_node->vn_vnode is NULL. */
1390 vp->v_iflag |= VI_CHANGING;
1391 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1392 if (error) {
1393 mutex_enter(&vcache.lock);
1394 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1395 new_node, vcache_node, vn_hash);
1396 mutex_exit(&vcache.lock);
1397 KASSERT(vp->v_usecount == 1);
1398 vp->v_usecount = 0;
1399 vcache_free(new_node);
1400 vfs_unbusy(mp, false, NULL);
1401 KASSERT(*vpp == NULL);
1402 return error;
1403 }
1404 KASSERT(new_key != NULL);
1405 KASSERT(memcmp(key, new_key, key_len) == 0);
1406 KASSERT(vp->v_op != NULL);
1407 vfs_insmntque(vp, mp);
1408 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1409 vp->v_vflag |= VV_MPSAFE;
1410 vfs_unbusy(mp, true, NULL);
1411
1412 /* Finished loading, finalize node. */
1413 mutex_enter(&vcache.lock);
1414 new_node->vn_key.vk_key = new_key;
1415 new_node->vn_vnode = vp;
1416 mutex_exit(&vcache.lock);
1417 mutex_enter(vp->v_interlock);
1418 vp->v_iflag &= ~VI_CHANGING;
1419 cv_broadcast(&vp->v_cv);
1420 mutex_exit(vp->v_interlock);
1421 *vpp = vp;
1422 return 0;
1423 }
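
/*
 * Sketch of a vcache_get() caller, e.g. a file system's VFS_VGET(9)
 * implementation keyed on the inode number (the key layout is entirely
 * up to the file system; "ino" below is illustrative):
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error != 0)
 *		return error;
 *	// vp: referenced, not locked; use vn_lock() if a lock is needed
 */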
1424
1425 /*
1426 * Create a new vnode / fs node pair and return it referenced through vpp.
1427 */
1428 int
1429 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1430 kauth_cred_t cred, struct vnode **vpp)
1431 {
1432 int error;
1433 uint32_t hash;
1434 struct vnode *vp;
1435 struct vcache_node *new_node;
1436 struct vcache_node *old_node __diagused;
1437
1438 *vpp = NULL;
1439
1440 /* Allocate and initialize a new vcache / vnode pair. */
1441 error = vfs_busy(mp, NULL);
1442 if (error)
1443 return error;
1444 new_node = vcache_alloc();
1445 new_node->vn_key.vk_mount = mp;
1446 vp = VN_TO_VP(new_node);
1447
1448 /* Create and load the fs node. */
1449 vp->v_iflag |= VI_CHANGING;
1450 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
1451 &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
1452 if (error) {
1453 KASSERT(vp->v_usecount == 1);
1454 vp->v_usecount = 0;
1455 vcache_free(VP_TO_VN(vp));
1456 vfs_unbusy(mp, false, NULL);
1457 KASSERT(*vpp == NULL);
1458 return error;
1459 }
1460 KASSERT(new_node->vn_key.vk_key != NULL);
1461 KASSERT(vp->v_op != NULL);
1462 hash = vcache_hash(&new_node->vn_key);
1463
1464 /* Wait for previous instance to be reclaimed, then insert new node. */
1465 mutex_enter(&vcache.lock);
1466 while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
1467 #ifdef DIAGNOSTIC
1468 if (old_node->vn_vnode != NULL)
1469 mutex_enter(old_node->vn_vnode->v_interlock);
1470 KASSERT(old_node->vn_vnode == NULL ||
1471 (old_node->vn_vnode->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0);
1472 if (old_node->vn_vnode != NULL)
1473 mutex_exit(old_node->vn_vnode->v_interlock);
1474 #endif
1475 mutex_exit(&vcache.lock);
1476 kpause("vcache", false, mstohz(20), NULL);
1477 mutex_enter(&vcache.lock);
1478 }
1479 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1480 new_node, vn_hash);
1481 mutex_exit(&vcache.lock);
1482 vfs_insmntque(vp, mp);
1483 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1484 vp->v_vflag |= VV_MPSAFE;
1485 vfs_unbusy(mp, true, NULL);
1486
1487 /* Finished loading, finalize node. */
1488 mutex_enter(&vcache.lock);
1489 new_node->vn_vnode = vp;
1490 mutex_exit(&vcache.lock);
1491 mutex_enter(vp->v_interlock);
1492 vp->v_iflag &= ~VI_CHANGING;
1493 cv_broadcast(&vp->v_cv);
1494 mutex_exit(vp->v_interlock);
1495 *vpp = vp;
1496 return 0;
1497 }
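
/*
 * Sketch of a vcache_new() caller, e.g. a file system's create path:
 * the fs allocates and initialises the new fs node in its
 * VFS_NEWVNODE(9) implementation and gets back a referenced, unlocked
 * vnode (dvp, vap and cred are supplied by the caller).
 *
 *	error = vcache_new(dvp->v_mount, dvp, vap, cred, &vp);
 *	if (error != 0)
 *		return error;
 */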
1498
1499 /*
1500 * Prepare key change: lock old and new cache node.
1501 * Return an error if the new node already exists.
1502 */
1503 int
1504 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1505 const void *old_key, size_t old_key_len,
1506 const void *new_key, size_t new_key_len)
1507 {
1508 uint32_t old_hash, new_hash;
1509 struct vcache_key old_vcache_key, new_vcache_key;
1510 struct vcache_node *node, *new_node;
1511
1512 old_vcache_key.vk_mount = mp;
1513 old_vcache_key.vk_key = old_key;
1514 old_vcache_key.vk_key_len = old_key_len;
1515 old_hash = vcache_hash(&old_vcache_key);
1516
1517 new_vcache_key.vk_mount = mp;
1518 new_vcache_key.vk_key = new_key;
1519 new_vcache_key.vk_key_len = new_key_len;
1520 new_hash = vcache_hash(&new_vcache_key);
1521
1522 new_node = vcache_alloc();
1523 new_node->vn_key = new_vcache_key;
1524
1525 mutex_enter(&vcache.lock);
1526
1527 /* Insert locked new node used as placeholder. */
1528 node = vcache_hash_lookup(&new_vcache_key, new_hash);
1529 if (node != NULL) {
1530 mutex_exit(&vcache.lock);
1531 KASSERT(VN_TO_VP(new_node)->v_usecount == 1);
1532 VN_TO_VP(new_node)->v_usecount = 0;
1533 vcache_free(new_node);
1534 return EEXIST;
1535 }
1536 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1537 new_node, vn_hash);
1538
1539 /* Lock old node. */
1540 node = vcache_hash_lookup(&old_vcache_key, old_hash);
1541 KASSERT(node != NULL);
1542 KASSERT(node->vn_vnode == vp);
1543 node->vn_vnode = NULL;
1544 node->vn_key = old_vcache_key;
1545 mutex_exit(&vcache.lock);
1546 return 0;
1547 }
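
/*
 * Sketch of the rekey protocol: a file system changing the key of a
 * live vnode brackets the change with vcache_rekey_enter() and
 * vcache_rekey_exit(); between the two calls neither key resolves, so
 * concurrent vcache_get() callers wait.
 *
 *	error = vcache_rekey_enter(mp, vp, &oldkey, sizeof(oldkey),
 *	    &newkey, sizeof(newkey));
 *	if (error != 0)
 *		return error;		// EEXIST: new key already cached
 *	...update the fs node so it is now known by newkey...
 *	vcache_rekey_exit(mp, vp, &oldkey, sizeof(oldkey),
 *	    &newkey, sizeof(newkey));
 */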
1548
1549 /*
1550 * Key change complete: remove old node and unlock new node.
1551 */
1552 void
1553 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1554 const void *old_key, size_t old_key_len,
1555 const void *new_key, size_t new_key_len)
1556 {
1557 uint32_t old_hash, new_hash;
1558 struct vcache_key old_vcache_key, new_vcache_key;
1559 struct vcache_node *old_node, *new_node;
1560
1561 old_vcache_key.vk_mount = mp;
1562 old_vcache_key.vk_key = old_key;
1563 old_vcache_key.vk_key_len = old_key_len;
1564 old_hash = vcache_hash(&old_vcache_key);
1565
1566 new_vcache_key.vk_mount = mp;
1567 new_vcache_key.vk_key = new_key;
1568 new_vcache_key.vk_key_len = new_key_len;
1569 new_hash = vcache_hash(&new_vcache_key);
1570
1571 mutex_enter(&vcache.lock);
1572
1573 /* Lookup old and new node. */
1574 old_node = vcache_hash_lookup(&old_vcache_key, old_hash);
1575 KASSERT(old_node != NULL);
1576 KASSERT(old_node->vn_vnode == NULL);
1577 new_node = vcache_hash_lookup(&new_vcache_key, new_hash);
1578 KASSERT(new_node != NULL && new_node->vn_vnode == NULL);
1579 KASSERT(new_node->vn_key.vk_key_len == new_key_len);
1580
1581 /* Rekey old node and put it onto its new hashlist. */
1582 old_node->vn_vnode = vp;
1583 old_node->vn_key = new_vcache_key;
1584 if (old_hash != new_hash) {
1585 SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
1586 old_node, vcache_node, vn_hash);
1587 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1588 old_node, vn_hash);
1589 }
1590
1591 /* Remove new node used as placeholder. */
1592 SLIST_REMOVE(&vcache.hashtab[new_hash & vcache.hashmask],
1593 new_node, vcache_node, vn_hash);
1594 mutex_exit(&vcache.lock);
1595 KASSERT(VN_TO_VP(new_node)->v_usecount == 1);
1596 VN_TO_VP(new_node)->v_usecount = 0;
1597 vcache_free(new_node);
1598 }
1599
1600 /*
1601 * Remove a vnode / fs node pair from the cache.
1602 */
1603 void
1604 vcache_remove(struct mount *mp, const void *key, size_t key_len)
1605 {
1606 uint32_t hash;
1607 struct vcache_key vcache_key;
1608 struct vcache_node *node;
1609
1610 vcache_key.vk_mount = mp;
1611 vcache_key.vk_key = key;
1612 vcache_key.vk_key_len = key_len;
1613 hash = vcache_hash(&vcache_key);
1614
1615 mutex_enter(&vcache.lock);
1616 node = vcache_hash_lookup(&vcache_key, hash);
1617 KASSERT(node != NULL);
1618 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1619 node, vcache_node, vn_hash);
1620 mutex_exit(&vcache.lock);
1621 }
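
/*
 * Sketch: the usual vcache_remove() caller is a file system's
 * VOP_RECLAIM(9) implementation, which removes the pair from the cache
 * before freeing the fs node (keyed here on an inode number, as in the
 * vcache_get() sketch above):
 *
 *	vcache_remove(vp->v_mount, &ino, sizeof(ino));
 *	...free the fs-private data, set vp->v_data = NULL...
 */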
1622
1623 /*
1624 * Print a vcache node.
1625 */
1626 void
1627 vcache_print(vnode_t *vp, const char *prefix, void (*pr)(const char *, ...))
1628 {
1629 int n;
1630 const uint8_t *cp;
1631 struct vcache_node *node;
1632
1633 node = VP_TO_VN(vp);
1634 n = node->vn_key.vk_key_len;
1635 cp = node->vn_key.vk_key;
1636
1637 (*pr)("%sstate %s, key(%d)", prefix, vstate_name(node->vn_state), n);
1638
1639 while (n-- > 0)
1640 (*pr)(" %02x", *cp++);
1641 (*pr)("\n");
1642 }
1643
/*
 * Update the outstanding I/O count and wake up any waiters if it
 * drops to zero.
 */
1647 void
1648 vwakeup(struct buf *bp)
1649 {
1650 vnode_t *vp;
1651
1652 if ((vp = bp->b_vp) == NULL)
1653 return;
1654
1655 KASSERT(bp->b_objlock == vp->v_interlock);
1656 KASSERT(mutex_owned(bp->b_objlock));
1657
1658 if (--vp->v_numoutput < 0)
1659 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1660 if (vp->v_numoutput == 0)
1661 cv_broadcast(&vp->v_cv);
1662 }
1663
1664 /*
1665 * Test a vnode for being or becoming dead. Returns one of:
1666 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1667 * ENOENT: vnode is dead.
1668 * 0: otherwise.
1669 *
1670 * Whenever this function returns a non-zero value all future
1671 * calls will also return a non-zero value.
1672 */
1673 int
1674 vdead_check(struct vnode *vp, int flags)
1675 {
1676
1677 KASSERT(mutex_owned(vp->v_interlock));
1678 if (ISSET(vp->v_iflag, VI_XLOCK)) {
1679 if (ISSET(flags, VDEAD_NOWAIT))
1680 return EBUSY;
1681 vwait(vp, VI_XLOCK);
1682 KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
1683 }
1684 if (ISSET(vp->v_iflag, VI_CLEAN))
1685 return ENOENT;
1686 return 0;
1687 }
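
/*
 * Sketch of a vdead_check() caller, e.g. a layered file system or
 * device code probing whether a vnode it still references is usable:
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		...vp is dead or becoming dead; stop using it...
 */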
1688
1689 /*
1690 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
1691 * recycled.
1692 */
1693 static void
1694 vwait(vnode_t *vp, int flags)
1695 {
1696
1697 KASSERT(mutex_owned(vp->v_interlock));
1698 KASSERT(vp->v_usecount != 0);
1699
1700 while ((vp->v_iflag & flags) != 0)
1701 cv_wait(&vp->v_cv, vp->v_interlock);
1702 }
1703
1704 int
1705 vfs_drainvnodes(long target)
1706 {
1707 int error;
1708
1709 mutex_enter(&vnode_free_list_lock);
1710
1711 while (numvnodes > target) {
1712 error = cleanvnode();
1713 if (error != 0)
1714 return error;
1715 mutex_enter(&vnode_free_list_lock);
1716 }
1717
1718 mutex_exit(&vnode_free_list_lock);
1719
1720 vcache_reinit();
1721
1722 return 0;
1723 }
1724
1725 void
1726 vnpanic(vnode_t *vp, const char *fmt, ...)
1727 {
1728 va_list ap;
1729
1730 #ifdef DIAGNOSTIC
1731 vprint(NULL, vp);
1732 #endif
1733 va_start(ap, fmt);
1734 vpanic(fmt, ap);
1735 va_end(ap);
1736 }
1737