/*	$NetBSD: vfs_vnode.c,v 1.39.2.2 2015/06/06 14:40:22 skrll Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually, it
 *	checks its own references, e.g. the link count or whether the file
 *	was removed).
 *
 *	Depending on that indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
 *	disassociate the underlying file system from the vnode, and finally
 *	destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9), vrele(9) and vput(9) routines.  Common reference
 *	holders are, e.g., open files, current working directories and
 *	mount points.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without the interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
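
/*
 * As an illustration of the reference-counting rules above, a consumer
 * that briefly needs a vnode it obtained elsewhere typically brackets the
 * use like this (a sketch only; "lookup_somehow" stands in for whatever
 * yields a referenced, locked vnode, e.g. namei(9)):
 *
 *	vnode_t *vp;
 *
 *	error = lookup_somehow(&vp);	// returns vp referenced and locked
 *	if (error == 0) {
 *		... use vp ...
 *		vput(vp);		// VOP_UNLOCK() plus vrele()
 *	}
 *
 * A long-term reference, held without the vnode lock, is taken with
 * vref(9) and dropped with vrele(9).
 */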

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.39.2.2 2015/06/06 14:40:22 skrll Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
#define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */

struct vcache_key {
	struct mount *vk_mount;
	const void *vk_key;
	size_t vk_key_len;
};
struct vcache_node {
	SLIST_ENTRY(vcache_node) vn_hash;
	struct vnode *vn_vnode;
	struct vcache_key vn_key;
};

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;
static struct mount	*dead_mount;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

SLIST_HEAD(hashhead, vcache_node);
static struct {
	kmutex_t	lock;
	u_long		hashmask;
	struct hashhead	*hashtab;
	pool_cache_t	pool;
}			vcache			__cacheline_aligned;

static int		cleanvnode(void);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);
static void		vwait(vnode_t *, int);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

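/*
 * Initialize the vnode management data structures: the vnode pool cache,
 * the "dead" mount point, the free/hold/release lists and their locks,
 * the vnode cache, and the vdrain and vrele helper threads.
 */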
void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	dead_mount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_mount != NULL);
	dead_mount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	vcache_init();

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
		return vp;
	}

	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);

	rw_init(&vp->v_lock);
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	fstrans_done(mp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	KASSERT(vp->v_data == NULL);

	/* Initialize vnode. */
	vp->v_tag = tag;
	vp->v_op = vops;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 */
int
vget(vnode_t *vp, int flags, bool waitok)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~LK_NOWAIT) == 0);
	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if ((vp->v_iflag & VI_CHANGING) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_CHANGING);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if ((flags & VRELEL_CHANGING_SET) != 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			cv_broadcast(&vp->v_cv);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERT(error == 0);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_exit(vp->v_interlock);
			return;
		}

		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			if (vtryrele(vp)) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	} else { /* vnode was already clean */
		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}

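/*
 * vrele: release a reference to a vnode.  If this drops the last
 * reference, take the interlock and let vrelel() deactivate the vnode.
 */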
void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release, vnode is released in different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

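/*
 * Helper thread that processes deferred (asynchronous) vnode releases
 * queued on vrele_list, skipping vnodes whose file systems cannot be
 * entered via fstrans at the moment.
 */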
static void
vrele_thread(void *cookie)
{
	vnodelst_t skip_list;
	vnode_t *vp;
	struct mount *mp;

	TAILQ_INIT(&skip_list);

	mutex_enter(&vrele_lock);
	for (;;) {
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
		}
		vp = TAILQ_FIRST(&vrele_list);
		mp = vp->v_mount;
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
			continue;
		}
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
		fstrans_done(mp);
		mutex_enter(&vrele_lock);
	}
}

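/*
 * Wait until the vrele thread has worked through the currently pending
 * asynchronous releases (i.e. until the queue drains or the thread
 * completes a full pass over it).
 */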
void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active, doclose;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	active = (vp->v_usecount > 1);
	doclose = ! (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);
	mutex_exit(vp->v_interlock);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	mutex_enter(vp->v_interlock);
	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (doclose) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Move to dead mount. */
	vp->v_vflag &= ~VV_ROOT;
	atomic_inc_uint(&dead_mount->mnt_refcnt);
	vfs_insmntque(vp, dead_mount);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (doclose) {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_iflag |= VI_CLEAN;
	} else {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode if caller holds the last reference.
 */
bool
vrecycle(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	}
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	} else if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return true;
	}
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	return true;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
}

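/*
 * Hash a vnode cache key (the mount point pointer plus the file system
 * specific key bytes) into a 32-bit bucket index.
 */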
static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
	uint32_t hash = HASH32_BUF_INIT;

	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
	return hash;
}

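/*
 * Initialize the vnode cache: the node pool, its lock and the hash table.
 */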
static void
vcache_init(void)
{

	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache.pool != NULL);
	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache.hashmask);
}

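/*
 * Resize the vnode cache hash table to match the current desiredvnodes
 * and rehash all existing nodes into the new table.
 */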
static void
vcache_reinit(void)
{
	int i;
	uint32_t hash;
	u_long oldmask, newmask;
	struct hashhead *oldtab, *newtab;
	struct vcache_node *node;

	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
	mutex_enter(&vcache.lock);
	oldtab = vcache.hashtab;
	oldmask = vcache.hashmask;
	vcache.hashtab = newtab;
	vcache.hashmask = newmask;
	for (i = 0; i <= oldmask; i++) {
		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
			hash = vcache_hash(&node->vn_key);
			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
			    node, vn_hash);
		}
	}
	mutex_exit(&vcache.lock);
	hashdone(oldtab, HASH_SLIST, oldmask);
}

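/*
 * Look up a key in the vnode cache hash table.  Returns the matching
 * node or NULL.  The vcache lock must be held.
 */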
static inline struct vcache_node *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
	struct hashhead *hashp;
	struct vcache_node *node;

	KASSERT(mutex_owned(&vcache.lock));

	hashp = &vcache.hashtab[hash & vcache.hashmask];
	SLIST_FOREACH(node, hashp, vn_hash) {
		if (key->vk_mount != node->vn_key.vk_mount)
			continue;
		if (key->vk_key_len != node->vn_key.vk_key_len)
			continue;
		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
			continue;
		return node;
	}
	return NULL;
}

/*
 * Get a vnode / fs node pair by key and return it referenced through vpp.
 */
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
    struct vnode **vpp)
{
	int error;
	uint32_t hash;
	const void *new_key;
	struct vnode *vp;
	struct vcache_key vcache_key;
	struct vcache_node *node, *new_node;

	new_key = NULL;
	*vpp = NULL;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

again:
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);

	/* If found, take a reference or retry. */
	if (__predict_true(node != NULL && node->vn_vnode != NULL)) {
		vp = node->vn_vnode;
		mutex_enter(vp->v_interlock);
		mutex_exit(&vcache.lock);
		error = vget(vp, 0, true /* wait */);
		if (error == ENOENT)
			goto again;
		if (error == 0)
			*vpp = vp;
		KASSERT((error != 0) == (*vpp == NULL));
		return error;
	}

	/* If another thread loads this node, wait and retry. */
	if (node != NULL) {
		KASSERT(node->vn_vnode == NULL);
		mutex_exit(&vcache.lock);
		kpause("vcache", false, mstohz(20), NULL);
		goto again;
	}
	mutex_exit(&vcache.lock);

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_vnode = NULL;
	new_node->vn_key = vcache_key;
	vp = vnalloc(NULL);
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	if (node == NULL) {
		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vn_hash);
		node = new_node;
	}
	mutex_exit(&vcache.lock);

	/* If another thread beat us inserting this node, retry. */
	if (node != new_node) {
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		goto again;
	}

	/* Load the fs node.  Exclusive as new_node->vn_vnode is NULL. */
	vp->v_iflag |= VI_CHANGING;
	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
	if (error) {
		mutex_enter(&vcache.lock);
		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vcache_node, vn_hash);
		mutex_exit(&vcache.lock);
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_key != NULL);
	KASSERT(memcmp(key, new_key, key_len) == 0);
	KASSERT(vp->v_op != NULL);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_key.vk_key = new_key;
	new_node->vn_vnode = vp;
	mutex_exit(&vcache.lock);
	mutex_enter(vp->v_interlock);
	vp->v_iflag &= ~VI_CHANGING;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}

/*
 * Create a new vnode / fs node pair and return it referenced through vpp.
 */
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, struct vnode **vpp)
{
	int error;
	uint32_t hash;
	struct vnode *vp;
	struct vcache_node *new_node;
	struct vcache_node *old_node __diagused;

	*vpp = NULL;

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_key.vk_mount = mp;
	new_node->vn_vnode = NULL;
	vp = vnalloc(NULL);

	/* Create and load the fs node. */
	vp->v_iflag |= VI_CHANGING;
	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
	    &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
	if (error) {
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_node->vn_key.vk_key != NULL);
	KASSERT(vp->v_op != NULL);
	hash = vcache_hash(&new_node->vn_key);

	/* Wait for previous instance to be reclaimed, then insert new node. */
	mutex_enter(&vcache.lock);
	while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
#ifdef DIAGNOSTIC
		if (old_node->vn_vnode != NULL)
			mutex_enter(old_node->vn_vnode->v_interlock);
		KASSERT(old_node->vn_vnode == NULL ||
		    (old_node->vn_vnode->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0);
		if (old_node->vn_vnode != NULL)
			mutex_exit(old_node->vn_vnode->v_interlock);
#endif
		mutex_exit(&vcache.lock);
		kpause("vcache", false, mstohz(20), NULL);
		mutex_enter(&vcache.lock);
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
	    new_node, vn_hash);
	mutex_exit(&vcache.lock);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_vnode = vp;
	mutex_exit(&vcache.lock);
	mutex_enter(vp->v_interlock);
	vp->v_iflag &= ~VI_CHANGING;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}

/*
 * Prepare key change: lock old and new cache node.
 * Return an error if the new node already exists.
 */
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node, *new_node;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_vnode = NULL;
	new_node->vn_key = new_vcache_key;

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	if (node != NULL) {
		mutex_exit(&vcache.lock);
		pool_cache_put(vcache.pool, new_node);
		return EEXIST;
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
	    new_node, vn_hash);
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(node->vn_vnode == vp);
	node->vn_vnode = NULL;
	node->vn_key = old_vcache_key;
	mutex_exit(&vcache.lock);
	return 0;
}

/*
 * Key change complete: remove old node and unlock new node.
 */
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	KASSERT(node != NULL && node->vn_vnode == NULL);
	KASSERT(node->vn_key.vk_key_len == new_key_len);
	node->vn_vnode = vp;
	node->vn_key = new_vcache_key;
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(node->vn_vnode == NULL);
	SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	pool_cache_put(vcache.pool, node);
}

/*
 * Remove a vnode / fs node pair from the cache.
 */
void
vcache_remove(struct mount *mp, const void *key, size_t key_len)
{
	uint32_t hash;
	struct vcache_key vcache_key;
	struct vcache_node *node;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	KASSERT(node != NULL);
	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	pool_cache_put(vcache.pool, node);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Test a vnode for being or becoming dead.  Returns one of:
 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 * ENOENT: vnode is dead.
 * 0:      otherwise.
 *
 * Whenever this function returns a non-zero value all future
 * calls will also return a non-zero value.
 */
int
vdead_check(struct vnode *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	if (ISSET(vp->v_iflag, VI_XLOCK)) {
		if (ISSET(flags, VDEAD_NOWAIT))
			return EBUSY;
		vwait(vp, VI_XLOCK);
		KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
	}
	if (ISSET(vp->v_iflag, VI_CLEAN))
		return ENOENT;
	return 0;
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
static void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

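/*
 * Reduce the number of allocated vnodes to 'target' by cleaning vnodes
 * from the free lists, then resize the vnode cache hash table.  Used when
 * desiredvnodes is lowered, e.g. via the kern.maxvnodes sysctl.
 */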
int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	vcache_reinit();

	return 0;
}

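/*
 * Panic with a vnode-specific message; prints the vnode first when
 * DIAGNOSTIC is enabled.
 */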
void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}