/*	$NetBSD: vfs_vnode.c,v 1.41 2015/04/20 13:44:16 riastradh Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was traditionally another way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually by
 *	checking its own references, e.g. the link count, or whether the
 *	file was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and is
 *	finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9), vrele(9) and vput(9) routines.  Typical holders of
 *	references are e.g. open files, current working directories,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from one non-zero value to another non-zero
 *	value can safely be done using atomic operations, without the
 *	interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
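
/*
 * Illustrative sketch (not a normative example): a typical consumer
 * obtains a referenced vnode, uses it, and then drops the reference.
 * The names below (mp, ino, vp) are placeholders.
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *	if (error == 0) {
 *		...use vp...
 *		vrele(vp);	(or vput(vp) if the caller locked it)
 *	}
 *
 * While a reference is already held, additional references may be taken
 * and dropped with vref(9)/vrele(9) without taking v_interlock, per the
 * usecount rules above.
 */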

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.41 2015/04/20 13:44:16 riastradh Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
#define VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */

struct vcache_key {
	struct mount *vk_mount;
	const void *vk_key;
	size_t vk_key_len;
};
struct vcache_node {
	SLIST_ENTRY(vcache_node) vn_hash;
	struct vnode *vn_vnode;
	struct vcache_key vn_key;
};

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;
static struct mount	*dead_mount;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

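/*
 * The vnode cache (vcache) maps (mount, file system specific key) pairs
 * to vnodes.  Nodes with vn_vnode == NULL are placeholders for vnodes
 * that are currently being loaded or rekeyed.
 */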
SLIST_HEAD(hashhead, vcache_node);
static struct {
	kmutex_t	lock;
	u_long		hashmask;
	struct hashhead	*hashtab;
	pool_cache_t	pool;
}			vcache			__cacheline_aligned;

static int		cleanvnode(void);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);
static void		vwait(vnode_t *, int);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	dead_mount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_mount != NULL);
	dead_mount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	vcache_init();

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
		return vp;
	}

	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);

	rw_init(&vp->v_lock);
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or its reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	fstrans_done(mp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	KASSERT(vp->v_data == NULL);

	/* Initialize vnode. */
	vp->v_tag = tag;
	vp->v_op = vops;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list and increment its
 * reference count.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 */
int
vget(vnode_t *vp, int flags, bool waitok)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~LK_NOWAIT) == 0);
	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if ((vp->v_iflag & VI_CHANGING) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_CHANGING);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

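	/*
	 * Lock-free path: atomically decrement v_usecount as long as the
	 * current value is greater than one.  A 1 -> 0 transition is
	 * refused here, so the caller must take v_interlock and go
	 * through vrelel() for the final release.
	 */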
	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if ((flags & VRELEL_CHANGING_SET) != 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			cv_broadcast(&vp->v_cv);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERT(error == 0);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_exit(vp->v_interlock);
			return;
		}

		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			if (vtryrele(vp)) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	} else { /* vnode was already clean */
		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}

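/*
 * vrele: release a reference.  If it was the last one, deactivate the
 * vnode via vrelel() and possibly recycle or free it.
 */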
void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnodelst_t skip_list;
	vnode_t *vp;
	struct mount *mp;

	TAILQ_INIT(&skip_list);

	mutex_enter(&vrele_lock);
	for (;;) {
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
		}
		vp = TAILQ_FIRST(&vrele_list);
		mp = vp->v_mount;
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
			continue;
		}
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
		fstrans_done(mp);
		mutex_enter(&vrele_lock);
	}
}

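/*
 * vrele_flush: wait until the vrele thread has processed the releases
 * that are currently pending.
 */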
void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active, doclose;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	active = (vp->v_usecount > 1);
	doclose = ! (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);
	mutex_exit(vp->v_interlock);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	mutex_enter(vp->v_interlock);
	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (doclose) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Move to dead mount. */
	vp->v_vflag &= ~VV_ROOT;
	atomic_inc_uint(&dead_mount->mnt_refcnt);
	vfs_insmntque(vp, dead_mount);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (doclose) {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_iflag |= VI_CLEAN;
	} else {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode if caller holds the last reference.
 */
bool
vrecycle(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	}
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	} else if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return true;
	}
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	return true;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
}

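/*
 * Hash a cache key: mix the mount pointer with the file system
 * specific key bytes.
 */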
static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
	uint32_t hash = HASH32_BUF_INIT;

	hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
	hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
	return hash;
}

static void
vcache_init(void)
{

	vcache.pool = pool_cache_init(sizeof(struct vcache_node), 0, 0, 0,
	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache.pool != NULL);
	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache.hashmask);
}

static void
vcache_reinit(void)
{
	int i;
	uint32_t hash;
	u_long oldmask, newmask;
	struct hashhead *oldtab, *newtab;
	struct vcache_node *node;

	newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
	mutex_enter(&vcache.lock);
	oldtab = vcache.hashtab;
	oldmask = vcache.hashmask;
	vcache.hashtab = newtab;
	vcache.hashmask = newmask;
	for (i = 0; i <= oldmask; i++) {
		while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
			SLIST_REMOVE(&oldtab[i], node, vcache_node, vn_hash);
			hash = vcache_hash(&node->vn_key);
			SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
			    node, vn_hash);
		}
	}
	mutex_exit(&vcache.lock);
	hashdone(oldtab, HASH_SLIST, oldmask);
}

static inline struct vcache_node *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
	struct hashhead *hashp;
	struct vcache_node *node;

	KASSERT(mutex_owned(&vcache.lock));

	hashp = &vcache.hashtab[hash & vcache.hashmask];
	SLIST_FOREACH(node, hashp, vn_hash) {
		if (key->vk_mount != node->vn_key.vk_mount)
			continue;
		if (key->vk_key_len != node->vn_key.vk_key_len)
			continue;
		if (memcmp(key->vk_key, node->vn_key.vk_key, key->vk_key_len))
			continue;
		return node;
	}
	return NULL;
}

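/*
 * A cache node passes through two visible states: inserted with
 * vn_vnode == NULL while the file system node is being loaded, and
 * fully initialized once vn_vnode points to the vnode.  Lookups that
 * find a loading node wait and retry.
 *
 * Illustrative sketch (names are placeholders): a file system's lookup
 * path would typically obtain a vnode for an inode number with
 *
 *	error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *
 * and receive the vnode referenced through the last argument on success.
 */
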
/*
 * Get a vnode / fs node pair by key and return it referenced through vpp.
 */
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
    struct vnode **vpp)
{
	int error;
	uint32_t hash;
	const void *new_key;
	struct vnode *vp;
	struct vcache_key vcache_key;
	struct vcache_node *node, *new_node;

	new_key = NULL;
	*vpp = NULL;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

again:
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);

	/* If found, take a reference or retry. */
	if (__predict_true(node != NULL && node->vn_vnode != NULL)) {
		vp = node->vn_vnode;
		mutex_enter(vp->v_interlock);
		mutex_exit(&vcache.lock);
		error = vget(vp, 0, true /* wait */);
		if (error == ENOENT)
			goto again;
		if (error == 0)
			*vpp = vp;
		KASSERT((error != 0) == (*vpp == NULL));
		return error;
	}

	/* If another thread is loading this node, wait and retry. */
	if (node != NULL) {
		KASSERT(node->vn_vnode == NULL);
		mutex_exit(&vcache.lock);
		kpause("vcache", false, mstohz(20), NULL);
		goto again;
	}
	mutex_exit(&vcache.lock);

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_vnode = NULL;
	new_node->vn_key = vcache_key;
	vp = vnalloc(NULL);
	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	if (node == NULL) {
		SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vn_hash);
		node = new_node;
	}
	mutex_exit(&vcache.lock);

	/* If another thread beat us inserting this node, retry. */
	if (node != new_node) {
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		goto again;
	}

	/* Load the fs node.  Exclusive as new_node->vn_vnode is NULL. */
	vp->v_iflag |= VI_CHANGING;
	error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
	if (error) {
		mutex_enter(&vcache.lock);
		SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
		    new_node, vcache_node, vn_hash);
		mutex_exit(&vcache.lock);
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_key != NULL);
	KASSERT(memcmp(key, new_key, key_len) == 0);
	KASSERT(vp->v_op != NULL);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_key.vk_key = new_key;
	new_node->vn_vnode = vp;
	mutex_exit(&vcache.lock);
	mutex_enter(vp->v_interlock);
	vp->v_iflag &= ~VI_CHANGING;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}

/*
 * Create a new vnode / fs node pair and return it referenced through vpp.
 */
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, struct vnode **vpp)
{
	int error;
	uint32_t hash;
	struct vnode *vp;
	struct vcache_node *new_node;
	struct vcache_node *old_node __diagused;

	*vpp = NULL;

	/* Allocate and initialize a new vcache / vnode pair. */
	error = vfs_busy(mp, NULL);
	if (error)
		return error;
	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_key.vk_mount = mp;
	new_node->vn_vnode = NULL;
	vp = vnalloc(NULL);

	/* Create and load the fs node. */
	vp->v_iflag |= VI_CHANGING;
	error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
	    &new_node->vn_key.vk_key_len, &new_node->vn_key.vk_key);
	if (error) {
		pool_cache_put(vcache.pool, new_node);
		KASSERT(vp->v_usecount == 1);
		vp->v_usecount = 0;
		vnfree(vp);
		vfs_unbusy(mp, false, NULL);
		KASSERT(*vpp == NULL);
		return error;
	}
	KASSERT(new_node->vn_key.vk_key != NULL);
	KASSERT(vp->v_op != NULL);
	hash = vcache_hash(&new_node->vn_key);

	/* Wait for previous instance to be reclaimed, then insert new node. */
	mutex_enter(&vcache.lock);
	while ((old_node = vcache_hash_lookup(&new_node->vn_key, hash))) {
#ifdef DIAGNOSTIC
		if (old_node->vn_vnode != NULL)
			mutex_enter(old_node->vn_vnode->v_interlock);
		KASSERT(old_node->vn_vnode == NULL ||
		    (old_node->vn_vnode->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0);
		if (old_node->vn_vnode != NULL)
			mutex_exit(old_node->vn_vnode->v_interlock);
#endif
		mutex_exit(&vcache.lock);
		kpause("vcache", false, mstohz(20), NULL);
		mutex_enter(&vcache.lock);
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
	    new_node, vn_hash);
	mutex_exit(&vcache.lock);
	vfs_insmntque(vp, mp);
	if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
		vp->v_vflag |= VV_MPSAFE;
	vfs_unbusy(mp, true, NULL);

	/* Finished loading, finalize node. */
	mutex_enter(&vcache.lock);
	new_node->vn_vnode = vp;
	mutex_exit(&vcache.lock);
	mutex_enter(vp->v_interlock);
	vp->v_iflag &= ~VI_CHANGING;
	cv_broadcast(&vp->v_cv);
	mutex_exit(vp->v_interlock);
	*vpp = vp;
	return 0;
}

/*
 * Prepare key change: lock old and new cache node.
 * Return an error if the new node already exists.
 */
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node, *new_node;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	new_node = pool_cache_get(vcache.pool, PR_WAITOK);
	new_node->vn_vnode = NULL;
	new_node->vn_key = new_vcache_key;

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	if (node != NULL) {
		mutex_exit(&vcache.lock);
		pool_cache_put(vcache.pool, new_node);
		return EEXIST;
	}
	SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
	    new_node, vn_hash);
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(node->vn_vnode == vp);
	node->vn_vnode = NULL;
	node->vn_key = old_vcache_key;
	mutex_exit(&vcache.lock);
	return 0;
}

/*
 * Key change complete: remove old node and unlock new node.
 */
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_key_len,
    const void *new_key, size_t new_key_len)
{
	uint32_t old_hash, new_hash;
	struct vcache_key old_vcache_key, new_vcache_key;
	struct vcache_node *node;

	old_vcache_key.vk_mount = mp;
	old_vcache_key.vk_key = old_key;
	old_vcache_key.vk_key_len = old_key_len;
	old_hash = vcache_hash(&old_vcache_key);

	new_vcache_key.vk_mount = mp;
	new_vcache_key.vk_key = new_key;
	new_vcache_key.vk_key_len = new_key_len;
	new_hash = vcache_hash(&new_vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&new_vcache_key, new_hash);
	KASSERT(node != NULL && node->vn_vnode == NULL);
	KASSERT(node->vn_key.vk_key_len == new_key_len);
	node->vn_vnode = vp;
	node->vn_key = new_vcache_key;
	node = vcache_hash_lookup(&old_vcache_key, old_hash);
	KASSERT(node != NULL);
	KASSERT(node->vn_vnode == NULL);
	SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	pool_cache_put(vcache.pool, node);
}

/*
 * Remove a vnode / fs node pair from the cache.
 */
void
vcache_remove(struct mount *mp, const void *key, size_t key_len)
{
	uint32_t hash;
	struct vcache_key vcache_key;
	struct vcache_node *node;

	vcache_key.vk_mount = mp;
	vcache_key.vk_key = key;
	vcache_key.vk_key_len = key_len;
	hash = vcache_hash(&vcache_key);

	mutex_enter(&vcache.lock);
	node = vcache_hash_lookup(&vcache_key, hash);
	KASSERT(node != NULL);
	SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
	    node, vcache_node, vn_hash);
	mutex_exit(&vcache.lock);
	pool_cache_put(vcache.pool, node);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Test a vnode for being or becoming dead.  Returns one of:
 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 * ENOENT: vnode is dead.
 * 0:      otherwise.
 *
 * Whenever this function returns a non-zero value all future
 * calls will also return a non-zero value.
 */
int
vdead_check(struct vnode *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	if (ISSET(vp->v_iflag, VI_XLOCK)) {
		if (ISSET(flags, VDEAD_NOWAIT))
			return EBUSY;
		vwait(vp, VI_XLOCK);
		KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
	}
	if (ISSET(vp->v_iflag, VI_CLEAN))
		return ENOENT;
	return 0;
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
static void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	vcache_reinit();

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}