/*	$NetBSD: vfs_vnode.c,v 1.44 2015/06/23 10:41:59 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles the vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
 *	inform the file system that the vnode is no longer in use.  Via
 *	this call, the file system indicates whether the vnode can be
 *	recycled (usually, it checks its own references, e.g. the link
 *	count or whether the file was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and is
 *	finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are e.g. open files, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without holding the
 *	interlock.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
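
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the usual pattern for using a vnode on which a reference is already
 * held, e.g. one obtained via namei(9) or VFS_VGET(9).  A long-lived
 * holder (file table, cwd, mount point) would instead take its own
 * reference with vref(9) and later drop it with vrele(9).
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	lock the vnode
 *	...					operate on it
 *	vput(vp);				VOP_UNLOCK(9) plus vrele(9)
 */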

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.44 2015/06/23 10:41:59 hannken Exp $");

#define	_VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
#define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */

struct vcache_key {
	struct mount *vk_mount;
	const void *vk_key;
	size_t vk_key_len;
};
struct vcache_node {
	SLIST_ENTRY(vcache_node) vn_hash;
	struct vnode *vn_vnode;
	struct vcache_key vn_key;
};

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

SLIST_HEAD(hashhead, vcache_node);
static struct {
	kmutex_t	lock;
	u_long		hashmask;
	struct hashhead	*hashtab;
	pool_cache_t	pool;
}			vcache			__cacheline_aligned;

static int		cleanvnode(void);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);
static void		vwait(vnode_t *, int);

/* Routines having to do with the management of the vnode table. */
extern struct mount	*dead_rootmount;
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_rootmount != NULL);
	dead_rootmount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	vcache_init();

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
		return vp;
	}

	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);

	rw_init(&vp->v_lock);
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	fstrans_done(mp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	KASSERT(vp->v_data == NULL);

	/* Initialize vnode. */
	vp->v_tag = tag;
	vp->v_op = vops;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}
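
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how a legacy, non-vcache file system pairs the two routines above when
 * another thread wins the race to load the same node.  "foo_hashget",
 * "foo_vnodeop_p" and "ino" are hypothetical.
 *
 *	error = getnewvnode(VT_NON, mp, foo_vnodeop_p, NULL, &vp);
 *	if (error)
 *		return error;
 *	if (foo_hashget(mp, ino) != NULL) {
 *		ungetnewvnode(vp);		lost the race, start over
 *		goto retry;
 *	}
 *	vp->v_data = ...;			attach the file system node
 */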

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list and increment its
 * reference count.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 */
int
vget(vnode_t *vp, int flags, bool waitok)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~LK_NOWAIT) == 0);
	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if ((vp->v_iflag & VI_CHANGING) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_CHANGING);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	return error;
}
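
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the typical vget() call sequence; this is the same pattern vcache_get()
 * uses below when it finds a cached vnode.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, 0, true);		releases the interlock
 *	if (error == ENOENT)
 *		goto retry;			vnode was reclaimed meanwhile
 */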

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if ((flags & VRELEL_CHANGING_SET) != 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			cv_broadcast(&vp->v_cv);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERT(error == 0);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_exit(vp->v_interlock);
			return;
		}

		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			if (vtryrele(vp)) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	} else { /* vnode was already clean */
		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release, vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnodelst_t skip_list;
	vnode_t *vp;
	struct mount *mp;

	TAILQ_INIT(&skip_list);

	mutex_enter(&vrele_lock);
	for (;;) {
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
		}
		vp = TAILQ_FIRST(&vrele_list);
		mp = vp->v_mount;
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
			continue;
		}
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
		fstrans_done(mp);
		mutex_enter(&vrele_lock);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	active = (vp->v_usecount > 1);
	mutex_exit(vp->v_interlock);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	mutex_enter(vp->v_interlock);
	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
	if (error != 0) {
		if (wapbl_vphaswapbl(vp))
			WAPBL_DISCARD(wapbl_vptomp(vp));
		error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
	}
	KASSERT(error == 0);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
	if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
		spec_node_revoke(vp);
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Move to dead mount. */
	vp->v_vflag &= ~VV_ROOT;
	atomic_inc_uint(&dead_rootmount->mnt_refcnt);
	vfs_insmntque(vp, dead_rootmount);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_vflag |= VV_LOCKSWORK;
	vp->v_iflag |= VI_CLEAN;
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode if caller holds the last reference.
 */
bool
vrecycle(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	}
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	} else if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return true;
	}
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	return true;
}
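
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a caller that holds what it believes to be the last reference can ask
 * for immediate reclamation instead of returning the vnode to the free
 * list.  "foo_node_gone" is a hypothetical predicate.
 *
 *	if (foo_node_gone(vp) && vrecycle(vp))
 *		return 0;		vnode was cleaned and released
 *	vrele(vp);			otherwise drop the reference normally
 */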

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
}

static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
	uint32_t hash = HASH32_BUF_INIT;

	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
	return hash;
}

static void
vcache_init(void)
{

	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache.pool != NULL);
	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache.hashmask);
}

static void
vcache_reinit(void)
{
	int i;
	uint32_t hash;
	u_long oldmask, newmask;
	struct hashhead *oldtab, *newtab;
	struct vcache_node *node;

	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
	mutex_enter(&vcache.lock);
	oldtab = vcache.hashtab;
	oldmask = vcache.hashmask;
	vcache.hashtab = newtab;
	vcache.hashmask = newmask;
	for (i = 0; i <= oldmask; i++) {
		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
			hash = vcache_hash(&node->vn_key);
			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
			    node, vn_hash);
		}
	}
	mutex_exit(&vcache.lock);
	hashdone(oldtab, HASH_SLIST, oldmask);
}

static inline struct vcache_node *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
	struct hashhead *hashp;
	struct vcache_node *node;

	KASSERT(mutex_owned(&vcache.lock));

	hashp = &vcache.hashtab[hash & vcache.hashmask];
	SLIST_FOREACH(node, hashp, vn_hash) {
		if (key->vk_mount != node->vn_key.vk_mount)
			continue;
		if (key->vk_key_len != node->vn_key.vk_key_len)
			continue;
		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
			continue;
		return node;
	}
	return NULL;
}

/*
 * Get a vnode / fs node pair by key and return it referenced through vpp.
 */
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
    struct vnode **vpp)
{
	int error;
	uint32_t hash;
	const void *new_key;
	struct vnode *vp;
	struct vcache_key vcache_key;
	struct vcache_node *node, *new_node;

	new_key = NULL;
	*vpp = NULL;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

again:
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);

	/* If found, take a reference or retry. */
	if (__predict_true(node != NULL && node->vn_vnode != NULL)) {
		vp = node->vn_vnode;
		mutex_enter(vp->v_interlock);
		mutex_exit(&vcache.lock);
		error = vget(vp, 0, true /* wait */);
		if (error == ENOENT)
			goto again;
		if (error == 0)
			*vpp = vp;
		KASSERT((error != 0) == (*vpp == NULL));
		return error;
	}

	/* If another thread loads this node, wait and retry. */
	if (node != NULL) {
		KASSERT(node->vn_vnode == NULL);
		mutex_exit(&vcache.lock);
		kpause("vcache", false, mstohz(20), NULL);
		goto again;
	}
	mutex_exit(&vcache.lock);

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_vnode = NULL;
	new_node->vn_key = vcache_key;
	vp = vnalloc(NULL);
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	if (node == NULL) {
		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vn_hash);
		node = new_node;
	}
	mutex_exit(&vcache.lock);

	/* If another thread beat us inserting this node, retry. */
	if (node != new_node) {
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		goto again;
	}

	/* Load the fs node.  Exclusive as new_node->vn_vnode is NULL. */
	vp->v_iflag |= VI_CHANGING;
	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
	if (error) {
		mutex_enter(&vcache.lock);
		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vcache_node, vn_hash);
		mutex_exit(&vcache.lock);
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_key != NULL);
	KASSERT(memcmp(key, new_key, key_len) == 0);
	KASSERT(vp->v_op != NULL);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_key.vk_key = new_key;
	new_node->vn_vnode = vp;
	mutex_exit(&vcache.lock);
	mutex_enter(vp->v_interlock);
	vp->v_iflag &= ~VI_CHANGING;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}
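
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a file system VFS_VGET(9) built on the vnode cache.  The key may be any
 * byte string that uniquely names the node within the mount; an inode
 * number is assumed here.  The vnode is returned referenced and unlocked,
 * and VFS_LOADVNODE(9) is called exactly once to set up v_data and v_op.
 *
 *	int
 *	foo_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 *	{
 *
 *		return vcache_get(mp, &ino, sizeof(ino), vpp);
 *	}
 */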

/*
 * Create a new vnode / fs node pair and return it referenced through vpp.
 */
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, struct vnode **vpp)
{
	int error;
	uint32_t hash;
	struct vnode *vp;
	struct vcache_node *new_node;
	struct vcache_node *old_node __diagused;

	*vpp = NULL;

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_key.vk_mount = mp;
	new_node->vn_vnode = NULL;
	vp = vnalloc(NULL);

	/* Create and load the fs node. */
	vp->v_iflag |= VI_CHANGING;
	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
	    &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
	if (error) {
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_node->vn_key.vk_key != NULL);
	KASSERT(vp->v_op != NULL);
	hash = vcache_hash(&new_node->vn_key);

	/* Wait for previous instance to be reclaimed, then insert new node. */
	mutex_enter(&vcache.lock);
	while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
#ifdef DIAGNOSTIC
		if (old_node->vn_vnode != NULL)
			mutex_enter(old_node->vn_vnode->v_interlock);
		KASSERT(old_node->vn_vnode == NULL ||
		    (old_node->vn_vnode->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0);
		if (old_node->vn_vnode != NULL)
			mutex_exit(old_node->vn_vnode->v_interlock);
#endif
		mutex_exit(&vcache.lock);
		kpause("vcache", false, mstohz(20), NULL);
		mutex_enter(&vcache.lock);
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
	    new_node, vn_hash);
	mutex_exit(&vcache.lock);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_vnode = vp;
	mutex_exit(&vcache.lock);
	mutex_enter(vp->v_interlock);
	vp->v_iflag &= ~VI_CHANGING;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}
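
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * creating a fresh node from a file system's VOP_CREATE(9) or VOP_MKNOD(9)
 * implementation.  "dvp" is the directory vnode, "vap" the attributes of
 * the new file and "cred" the caller's credentials; entering the node into
 * the directory is left to the file system.
 *
 *	error = vcache_new(dvp->v_mount, dvp, vap, cred, &vp);
 *	if (error)
 *		return error;
 *	...				link the new node into the directory
 */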

/*
 * Prepare key change: lock old and new cache node.
 * Return an error if the new node already exists.
 */
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node, *new_node;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_vnode = NULL;
	new_node->vn_key = new_vcache_key;

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	if (node != NULL) {
		mutex_exit(&vcache.lock);
		pool_cache_put(vcache.pool, new_node);
		return EEXIST;
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
	    new_node, vn_hash);
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(node->vn_vnode == vp);
	node->vn_vnode = NULL;
	node->vn_key = old_vcache_key;
	mutex_exit(&vcache.lock);
	return 0;
}

/*
 * Key change complete: remove old node and unlock new node.
 */
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	KASSERT(node != NULL && node->vn_vnode == NULL);
	KASSERT(node->vn_key.vk_key_len == new_key_len);
	node->vn_vnode = vp;
	node->vn_key = new_vcache_key;
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(node->vn_vnode == NULL);
	SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	pool_cache_put(vcache.pool, node);
}
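
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the two-step key change protocol.  Between the calls the file system
 * updates its own notion of the key; lookups of either key will wait,
 * since both cache nodes have vn_vnode == NULL during the transition.
 *
 *	error = vcache_rekey_enter(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 *	if (error)
 *		return error;			new key already cached
 *	...					update the fs node's key
 *	vcache_rekey_exit(mp, vp, &old_ino, sizeof(old_ino),
 *	    &new_ino, sizeof(new_ino));
 */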

/*
 * Remove a vnode / fs node pair from the cache.
 */
void
vcache_remove(struct mount *mp, const void *key, size_t key_len)
{
	uint32_t hash;
	struct vcache_key vcache_key;
	struct vcache_node *node;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	KASSERT(node != NULL);
	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	pool_cache_put(vcache.pool, node);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Test a vnode for being or becoming dead.  Returns one of:
 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 * ENOENT: vnode is dead.
 * 0:      otherwise.
 *
 * Whenever this function returns a non-zero value all future
 * calls will also return a non-zero value.
 */
int
vdead_check(struct vnode *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	if (ISSET(vp->v_iflag, VI_XLOCK)) {
		if (ISSET(flags, VDEAD_NOWAIT))
			return EBUSY;
		vwait(vp, VI_XLOCK);
		KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
	}
	if (ISSET(vp->v_iflag, VI_CLEAN))
		return ENOENT;
	return 0;
}
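
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a non-blocking check before operating on a vnode that may be revoked
 * concurrently.
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		return error;		EBUSY: becoming dead, ENOENT: dead
 */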

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
static void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	vcache_reinit();

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}