/*	$NetBSD: vfs_vnode.c,v 1.3 2011/04/02 05:07:57 rmind Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */
/*
 * Note on v_usecount and locking:
 *
 * At nearly all points where it is known that v_usecount could be zero,
 * the vnode interlock will be held.
 *
 * To change v_usecount away from zero, the interlock must be held.  To
 * change from a non-zero value to zero, again the interlock must be
 * held.
 *
 * There's a flag bit, VC_XLOCK, embedded in v_usecount.
 * To raise v_usecount, if the VC_XLOCK bit is set in it, the interlock
 * must be held.
 * To modify the VC_XLOCK bit, the interlock must be held.
 * We always keep the usecount (v_usecount & VC_MASK) non-zero while the
 * VC_XLOCK bit is set.
 *
 * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
 * value to a non-zero value can safely be done using atomic operations,
 * without the interlock held.
 * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
 * value can be done using atomic operations, without the interlock held.
 */
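
/*
 * Illustrative sketch (not compiled): under the rules above, a lock-free
 * reference grab is a compare-and-swap loop that backs off to the
 * interlock whenever the count is zero or VC_XLOCK is set, roughly:
 *
 *	use = vp->v_usecount;
 *	if (use == 0 || (use & VC_XLOCK) != 0)
 *		take the interlock instead;
 *	else
 *		atomic_cas_uint(&vp->v_usecount, use, use + 1);
 *
 * vtryget() below implements exactly this; vtryrele() applies the same
 * pattern to the release side.
 */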

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.3 2011/04/02 05:07:57 rmind Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

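/*
 * Global vnode bookkeeping.  numvnodes and the free/hold lists are
 * protected by vnode_free_list_lock; the deferred-release queue
 * (vrele_list, vrele_pending, vrele_gen) is protected by vrele_lock.
 */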
u_int                   numvnodes;

static pool_cache_t     vnode_cache;
static kmutex_t         vnode_free_list_lock;

static vnodelst_t       vnode_free_list;
static vnodelst_t       vnode_hold_list;
static vnodelst_t       vrele_list;

static kmutex_t         vrele_lock;
static kcondvar_t       vrele_cv;
static lwp_t *          vrele_lwp;
static int              vrele_pending;
static int              vrele_gen;

static vnode_t *        getcleanvnode(void);
static void             vrele_thread(void *);
static void             vpanic(vnode_t *, const char *);

/* Routines having to do with the management of the vnode table. */
extern int              (**dead_vnodeop_p)(void *);

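/*
 * vfs_vnode_sysinit: initialize the vnode pool cache, the free lists and
 * their lock, and start the kernel thread that performs deferred
 * (asynchronous) vnode releases.
 */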
void
vfs_vnode_sysinit(void)
{
        int error;

        vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
            NULL, IPL_NONE, NULL, NULL, NULL);
        KASSERT(vnode_cache != NULL);

        mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
        TAILQ_INIT(&vnode_free_list);
        TAILQ_INIT(&vnode_hold_list);
        TAILQ_INIT(&vrele_list);

        mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&vrele_cv, "vrele");
        error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
            NULL, &vrele_lwp, "vrele");
        KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode and we are prepared to wait for the allocation.
 */
vnode_t *
vnalloc(struct mount *mp)
{
        vnode_t *vp;

        vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
        if (vp == NULL) {
                return NULL;
        }

        memset(vp, 0, sizeof(*vp));
        UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
        cv_init(&vp->v_cv, "vnode");
        /*
         * Done by memset() above.
         *      LIST_INIT(&vp->v_nclist);
         *      LIST_INIT(&vp->v_dnclist);
         */

        if (mp != NULL) {
                vp->v_mount = mp;
                vp->v_type = VBAD;
                vp->v_iflag = VI_MARKER;
        } else {
                rw_init(&vp->v_lock);
        }

        return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

        KASSERT(vp->v_usecount == 0);

        if ((vp->v_iflag & VI_MARKER) == 0) {
                rw_destroy(&vp->v_lock);
                mutex_enter(&vnode_free_list_lock);
                numvnodes--;
                mutex_exit(&vnode_free_list_lock);
        }

        UVM_OBJ_DESTROY(&vp->v_uobj);
        cv_destroy(&vp->v_cv);
        pool_cache_put(vnode_cache, vp);
}

/*
 * getcleanvnode: grab a vnode from freelist and clean it.
 */
vnode_t *
getcleanvnode(void)
{
        vnode_t *vp;
        vnodelst_t *listhd;

        KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
        listhd = &vnode_free_list;
try_nextlist:
        TAILQ_FOREACH(vp, listhd, v_freelist) {
                /*
                 * It's safe to test v_usecount and v_iflag
                 * without holding the interlock here: vnodes
                 * in these states (referenced, or marked
                 * VI_CLEAN) should never appear on the free
                 * lists, so a mismatch is a panic condition
                 * regardless of any race.
                 */
                if (vp->v_usecount != 0) {
                        vpanic(vp, "free vnode isn't");
                }
                if ((vp->v_iflag & VI_CLEAN) != 0) {
                        vpanic(vp, "clean vnode on freelist");
                }
                if (vp->v_freelisthd != listhd) {
                        printf("vnode sez %p, listhd %p\n",
                            vp->v_freelisthd, listhd);
                        vpanic(vp, "list head mismatch");
                }
                if (!mutex_tryenter(&vp->v_interlock))
                        continue;
                if ((vp->v_iflag & VI_XLOCK) == 0)
                        break;
                mutex_exit(&vp->v_interlock);
        }

        if (vp == NULL) {
                if (listhd == &vnode_free_list) {
                        listhd = &vnode_hold_list;
                        goto try_nextlist;
                }
                mutex_exit(&vnode_free_list_lock);
                return NULL;
        }

        /* Remove it from the freelist. */
        TAILQ_REMOVE(listhd, vp, v_freelist);
        vp->v_freelisthd = NULL;
        mutex_exit(&vnode_free_list_lock);

        KASSERT(vp->v_usecount == 0);

        /*
         * The vnode is still associated with a file system, so we must
         * clean it out before reusing it.  We need to add a reference
         * before doing this.  If the vnode gains another reference while
         * being cleaned out then we lose - retry.
         */
        atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
        vclean(vp, DOCLOSE);
        KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
        atomic_add_int(&vp->v_usecount, -VC_XLOCK);
        if (vp->v_usecount == 1) {
                /* We're about to dirty it. */
                vp->v_iflag &= ~VI_CLEAN;
                mutex_exit(&vp->v_interlock);
                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                        spec_node_destroy(vp);
                }
                vp->v_type = VNON;
        } else {
                /*
                 * Don't return to freelist - the holder of the last
                 * reference will destroy it.
                 */
                vrelel(vp, 0); /* releases vp->v_interlock */
                mutex_enter(&vnode_free_list_lock);
                goto retry;
        }

        if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
            !TAILQ_EMPTY(&vp->v_uobj.memq)) {
                vpanic(vp, "cleaned vnode isn't");
        }
        if (vp->v_numoutput != 0) {
                vpanic(vp, "clean vnode has pending I/O's");
        }
        if ((vp->v_iflag & VI_ONWORKLST) != 0) {
                vpanic(vp, "clean vnode on syncer list");
        }

        return vp;
}

/*
 * getnewvnode: return a fresh vnode, allocating a new one if possible,
 * otherwise recycling one from the free lists.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    vnode_t **vpp)
{
        struct uvm_object *uobj;
        static int toggle;
        vnode_t *vp;
        int error = 0, tryalloc;

try_again:
        if (mp != NULL) {
                /*
                 * Mark filesystem busy while we're creating a
                 * vnode.  If unmount is in progress, this will
                 * fail.
                 */
                error = vfs_busy(mp, NULL);
                if (error)
                        return error;
        }

        /*
         * We must choose whether to allocate a new vnode or recycle an
         * existing one.  The criterion for allocating a new one is that
         * the total number of vnodes is less than the number desired or
         * there are no vnodes on either free list.  Generally we only
         * want to recycle vnodes that have no buffers associated with
         * them, so we look first on the vnode_free_list.  If it is empty,
         * we next consider vnodes with referencing buffers on the
         * vnode_hold_list.  The toggle ensures that half the time we
         * will use a vnode from the vnode_hold_list, and half the time
         * we will allocate a new one unless the list has grown to twice
         * the desired size.  We are reluctant to recycle vnodes from the
         * vnode_hold_list because we will lose the identity of all its
         * referencing buffers.
         */

        vp = NULL;

        mutex_enter(&vnode_free_list_lock);

        toggle ^= 1;
        if (numvnodes > 2 * desiredvnodes)
                toggle = 0;

        tryalloc = numvnodes < desiredvnodes ||
            (TAILQ_FIRST(&vnode_free_list) == NULL &&
            (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

        if (tryalloc) {
                numvnodes++;
                mutex_exit(&vnode_free_list_lock);
                if ((vp = vnalloc(NULL)) == NULL) {
                        mutex_enter(&vnode_free_list_lock);
                        numvnodes--;
                } else
                        vp->v_usecount = 1;
        }

        if (vp == NULL) {
                vp = getcleanvnode();
                if (vp == NULL) {
                        if (mp != NULL) {
                                vfs_unbusy(mp, false, NULL);
                        }
                        if (tryalloc) {
                                printf("WARNING: unable to allocate new "
                                    "vnode, retrying...\n");
                                kpause("newvn", false, hz, NULL);
                                goto try_again;
                        }
                        tablefull("vnode", "increase kern.maxvnodes or NVNODE");
                        *vpp = 0;
                        return (ENFILE);
                }
                vp->v_iflag = 0;
                vp->v_vflag = 0;
                vp->v_uflag = 0;
                vp->v_socket = NULL;
        }

        KASSERT(vp->v_usecount == 1);
        KASSERT(vp->v_freelisthd == NULL);
        KASSERT(LIST_EMPTY(&vp->v_nclist));
        KASSERT(LIST_EMPTY(&vp->v_dnclist));

        vp->v_type = VNON;
        vp->v_tag = tag;
        vp->v_op = vops;
        vfs_insmntque(vp, mp);
        *vpp = vp;
        vp->v_data = NULL;

        /*
         * initialize uvm_object within vnode.
         */

        uobj = &vp->v_uobj;
        KASSERT(uobj->pgops == &uvm_vnodeops);
        KASSERT(uobj->uo_npages == 0);
        KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
        vp->v_size = vp->v_writesize = VSIZENOTSET;

        if (mp != NULL) {
                if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
                        vp->v_vflag |= VV_MPSAFE;
                vfs_unbusy(mp, true, NULL);
        }

        return (0);
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

        KASSERT(vp->v_usecount == 1);
        KASSERT(vp->v_data == NULL);
        KASSERT(vp->v_freelisthd == NULL);

        mutex_enter(&vp->v_interlock);
        vp->v_iflag |= VI_CLEAN;
        vrelel(vp, 0);
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT(vp->v_usecount == 0);

        /*
         * Note that the reference count must not change until
         * the vnode is removed.
         */
        mutex_enter(&vnode_free_list_lock);
        if (vp->v_holdcnt > 0) {
                KASSERT(vp->v_freelisthd == &vnode_hold_list);
        } else {
                KASSERT(vp->v_freelisthd == &vnode_free_list);
        }
        TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
        vp->v_freelisthd = NULL;
        mutex_exit(&vnode_free_list_lock);
}

/*
 * Try to gain a reference to a vnode, without acquiring its interlock.
 * The caller must hold a lock that will prevent the vnode from being
 * recycled or freed.
 */
bool
vtryget(vnode_t *vp)
{
        u_int use, next;

        /*
         * If the vnode is being freed, don't make life any harder
         * for vclean() by adding another reference without waiting.
         * This is not strictly necessary, but we'll do it anyway.
         */
        if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
                return false;
        }
        for (use = vp->v_usecount;; use = next) {
                if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
                        /* Need interlock held if first reference. */
                        return false;
                }
                next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
                if (__predict_true(next == use)) {
                        return true;
                }
        }
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  If the VI_XLOCK bit is set the
 * vnode is being eliminated in vclean()/vgone().  In that case we
 * cannot grab the vnode, so we wait for the transition to complete
 * and return an error to indicate that the vnode is no longer
 * usable (possibly having been changed to a new file system type).
 * Called with v_interlock held.
 */
int
vget(vnode_t *vp, int flags)
{
        int error = 0;

        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

        /*
         * Before adding a reference, we must remove the vnode
         * from its freelist.
         */
        if (vp->v_usecount == 0) {
                vremfree(vp);
                vp->v_usecount = 1;
        } else {
                atomic_inc_uint(&vp->v_usecount);
        }

        /*
         * If the vnode is in the process of being cleaned out for
         * another use, we wait for the cleaning to finish and then
         * return failure.  Cleaning is determined by checking if
         * the VI_XLOCK flag is set.
         */
        if ((vp->v_iflag & VI_XLOCK) != 0) {
                if ((flags & LK_NOWAIT) != 0) {
                        vrelel(vp, 0);
                        return EBUSY;
                }
                vwait(vp, VI_XLOCK);
                vrelel(vp, 0);
                return ENOENT;
        }

        /*
         * Ok, we got it in good shape.  Just locking left.
         */
        KASSERT((vp->v_iflag & VI_CLEAN) == 0);
        mutex_exit(&vp->v_interlock);
        if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
                error = vn_lock(vp, flags);
                if (error != 0) {
                        vrele(vp);
                }
        }
        return error;
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        VOP_UNLOCK(vp);
        vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
        u_int use, next;

        for (use = vp->v_usecount;; use = next) {
                if (use == 1) {
                        return false;
                }
                KASSERT((use & VC_MASK) > 1);
                next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
                if (__predict_true(next == use)) {
                        return true;
                }
        }
}

/*
 * Vnode release.  If the reference count drops to zero, call the
 * inactive routine and either return the vnode to the freelist or
 * free it to the pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
        bool recycle, defer;
        int error;

        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(vp->v_freelisthd == NULL);

        if (__predict_false(vp->v_op == dead_vnodeop_p &&
            (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
                vpanic(vp, "dead but not clean");
        }

        /*
         * If not the last reference, just drop the reference count
         * and unlock.
         */
        if (vtryrele(vp)) {
                vp->v_iflag |= VI_INACTREDO;
                mutex_exit(&vp->v_interlock);
                return;
        }
        if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
                vpanic(vp, "vrelel: bad ref count");
        }

        KASSERT((vp->v_iflag & VI_XLOCK) == 0);

        /*
         * If not clean, deactivate the vnode, but preserve
         * our reference across the call to VOP_INACTIVE().
         */
retry:
        if ((vp->v_iflag & VI_CLEAN) == 0) {
                recycle = false;
                vp->v_iflag |= VI_INACTNOW;

                /*
                 * XXX This ugly block can be largely eliminated if
                 * locking is pushed down into the file systems.
                 *
                 * Defer vnode release to vrele_thread if caller
                 * requests it explicitly.
                 */
                if ((curlwp == uvm.pagedaemon_lwp) ||
                    (flags & VRELEL_ASYNC_RELE) != 0) {
                        /* The pagedaemon can't wait around; defer. */
                        defer = true;
                } else if (curlwp == vrele_lwp) {
                        /* We have to try harder. */
                        vp->v_iflag &= ~VI_INACTREDO;
                        mutex_exit(&vp->v_interlock);
                        error = vn_lock(vp, LK_EXCLUSIVE);
                        if (error != 0) {
                                /* XXX */
                                vpanic(vp, "vrele: unable to lock %p");
                        }
                        defer = false;
                } else if ((vp->v_iflag & VI_LAYER) != 0) {
                        /*
                         * Acquiring the stack's lock in vclean() even
                         * for an honest vput/vrele is dangerous because
                         * our caller may hold other vnode locks; defer.
                         */
                        defer = true;
                } else {
                        /* If we can't acquire the lock, then defer. */
                        vp->v_iflag &= ~VI_INACTREDO;
                        mutex_exit(&vp->v_interlock);
                        error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
                        if (error != 0) {
                                defer = true;
                                mutex_enter(&vp->v_interlock);
                        } else {
                                defer = false;
                        }
                }

                if (defer) {
                        /*
                         * Defer reclaim to the kthread; it's not safe to
                         * clean it here.  We donate it our last reference.
                         */
                        KASSERT(mutex_owned(&vp->v_interlock));
                        KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
                        vp->v_iflag &= ~VI_INACTNOW;
                        vp->v_iflag |= VI_INACTPEND;
                        mutex_enter(&vrele_lock);
                        TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
                        if (++vrele_pending > (desiredvnodes >> 8))
                                cv_signal(&vrele_cv);
                        mutex_exit(&vrele_lock);
                        mutex_exit(&vp->v_interlock);
                        return;
                }

#ifdef DIAGNOSTIC
                if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
                    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
                        vprint("vrelel: missing VOP_CLOSE()", vp);
                }
#endif

                /*
                 * The vnode can gain another reference while being
                 * deactivated.  If VOP_INACTIVE() indicates that
                 * the described file has been deleted, then recycle
                 * the vnode irrespective of additional references.
                 * Another thread may be waiting to re-use the on-disk
                 * inode.
                 *
                 * Note that VOP_INACTIVE() will drop the vnode lock.
                 */
                VOP_INACTIVE(vp, &recycle);
                mutex_enter(&vp->v_interlock);
                vp->v_iflag &= ~VI_INACTNOW;
                if (!recycle) {
                        if (vtryrele(vp)) {
                                mutex_exit(&vp->v_interlock);
                                return;
                        }

                        /*
                         * If we grew another reference while
                         * VOP_INACTIVE() was underway, retry.
                         */
                        if ((vp->v_iflag & VI_INACTREDO) != 0) {
                                goto retry;
                        }
                }

                /* Take care of space accounting. */
                if (vp->v_iflag & VI_EXECMAP) {
                        atomic_add_int(&uvmexp.execpages,
                            -vp->v_uobj.uo_npages);
                        atomic_add_int(&uvmexp.filepages,
                            vp->v_uobj.uo_npages);
                }
                vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
                vp->v_vflag &= ~VV_MAPPED;

                /*
                 * Recycle the vnode if the file is now unused (unlinked),
                 * otherwise just free it.
                 */
                if (recycle) {
                        vclean(vp, DOCLOSE);
                }
                KASSERT(vp->v_usecount > 0);
        }

        if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
                /* Gained another reference while being reclaimed. */
                mutex_exit(&vp->v_interlock);
                return;
        }

        if ((vp->v_iflag & VI_CLEAN) != 0) {
                /*
                 * It's clean so destroy it.  It isn't referenced
                 * anywhere since it has been reclaimed.
                 */
                KASSERT(vp->v_holdcnt == 0);
                KASSERT(vp->v_writecount == 0);
                mutex_exit(&vp->v_interlock);
                vfs_insmntque(vp, NULL);
                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                        spec_node_destroy(vp);
                }
                vnfree(vp);
        } else {
                /*
                 * Otherwise, put it back onto the freelist.  It
                 * can't be destroyed while still associated with
                 * a file system.
                 */
                mutex_enter(&vnode_free_list_lock);
                if (vp->v_holdcnt > 0) {
                        vp->v_freelisthd = &vnode_hold_list;
                } else {
                        vp->v_freelisthd = &vnode_free_list;
                }
                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                mutex_exit(&vnode_free_list_lock);
                mutex_exit(&vp->v_interlock);
        }
}

void
vrele(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
                return;
        }
        mutex_enter(&vp->v_interlock);
        vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different
 * context, by the vrele kernel thread.
 */
void
vrele_async(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
                return;
        }
        mutex_enter(&vp->v_interlock);
        vrelel(vp, VRELEL_ASYNC_RELE);
}

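/*
 * vrele_thread: kernel thread that processes deferred vnode releases
 * queued on vrele_list, dropping the reference donated by vrelel().
 */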
static void
vrele_thread(void *cookie)
{
        vnode_t *vp;

        for (;;) {
                mutex_enter(&vrele_lock);
                while (TAILQ_EMPTY(&vrele_list)) {
                        vrele_gen++;
                        cv_broadcast(&vrele_cv);
                        cv_timedwait(&vrele_cv, &vrele_lock, hz);
                }
                vp = TAILQ_FIRST(&vrele_list);
                TAILQ_REMOVE(&vrele_list, vp, v_freelist);
                vrele_pending--;
                mutex_exit(&vrele_lock);

                /*
                 * If not the last reference, then ignore the vnode
                 * and look for more work.
                 */
                mutex_enter(&vp->v_interlock);
                KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
                vp->v_iflag &= ~VI_INACTPEND;
                vrelel(vp, 0);
        }
}

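/*
 * vrele_flush: wait for the vrele thread to drain the deferred-release
 * queue (or to report, via the generation count, that it has emptied it).
 */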
void
vrele_flush(void)
{
        int gen;

        mutex_enter(&vrele_lock);
        gen = vrele_gen;
        while (vrele_pending && gen == vrele_gen) {
                cv_broadcast(&vrele_cv);
                cv_wait(&vrele_cv, &vrele_lock);
        }
        mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(vp->v_usecount != 0);

        atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
                mutex_enter(&vnode_free_list_lock);
                KASSERT(vp->v_freelisthd == &vnode_free_list);
                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                vp->v_freelisthd = &vnode_hold_list;
                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                mutex_exit(&vnode_free_list_lock);
        }
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if (vp->v_holdcnt <= 0) {
                vpanic(vp, "holdrelel: holdcnt vp %p");
        }

        vp->v_holdcnt--;
        if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
                mutex_enter(&vnode_free_list_lock);
                KASSERT(vp->v_freelisthd == &vnode_hold_list);
                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                vp->v_freelisthd = &vnode_free_list;
                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                mutex_exit(&vnode_free_list_lock);
        }
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
        lwp_t *l = curlwp;
        bool recycle, active;
        int error;

        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(vp->v_usecount != 0);

        /* If cleaning is already in progress wait until done and return. */
        if (vp->v_iflag & VI_XLOCK) {
                vwait(vp, VI_XLOCK);
                return;
        }

        /* If already clean, nothing to do. */
        if ((vp->v_iflag & VI_CLEAN) != 0) {
                return;
        }

        /*
         * Prevent the vnode from being recycled or brought into use
         * while we clean it out.
         */
        vp->v_iflag |= VI_XLOCK;
        if (vp->v_iflag & VI_EXECMAP) {
                atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
                atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
        }
        vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
        active = (vp->v_usecount & VC_MASK) > 1;

        /* XXXAD should not lock vnode under layer */
        mutex_exit(&vp->v_interlock);
        VOP_LOCK(vp, LK_EXCLUSIVE);

        /*
         * Clean out any cached data associated with the vnode.
         * If purging an active vnode, it must be closed and
         * deactivated before being reclaimed.  Note that the
         * VOP_INACTIVE will unlock the vnode.
         */
        if (flags & DOCLOSE) {
                error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
                if (error != 0) {
                        /* XXX, fix vn_start_write's grab of mp and use that. */

                        if (wapbl_vphaswapbl(vp))
                                WAPBL_DISCARD(wapbl_vptomp(vp));
                        error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
                }
                KASSERT(error == 0);
                KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
                if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
                        spec_node_revoke(vp);
                }
        }
        if (active) {
                VOP_INACTIVE(vp, &recycle);
        } else {
                /*
                 * Any other processes trying to obtain this lock must first
                 * wait for VI_XLOCK to clear, then call the new lock operation.
                 */
                VOP_UNLOCK(vp);
        }

        /* Disassociate the underlying file system from the vnode. */
        if (VOP_RECLAIM(vp)) {
                vpanic(vp, "vclean: cannot reclaim");
        }

        KASSERT(vp->v_uobj.uo_npages == 0);
        if (vp->v_type == VREG && vp->v_ractx != NULL) {
                uvm_ra_freectx(vp->v_ractx);
                vp->v_ractx = NULL;
        }
        cache_purge(vp);

        /* Done with purge, notify sleepers of the grim news. */
        mutex_enter(&vp->v_interlock);
        vp->v_op = dead_vnodeop_p;
        vp->v_tag = VT_NON;
        KNOTE(&vp->v_klist, NOTE_REVOKE);
        vp->v_iflag &= ~VI_XLOCK;
        vp->v_vflag &= ~VV_LOCKSWORK;
        if ((flags & DOCLOSE) != 0) {
                vp->v_iflag |= VI_CLEAN;
        }
        cv_broadcast(&vp->v_cv);

        KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode: clean it out and release it, so that it can
 * be reused.  Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        mutex_enter(&vp->v_interlock);
        if (vp->v_usecount != 0) {
                mutex_exit(&vp->v_interlock);
                return (0);
        }
        if (inter_lkp) {
                mutex_exit(inter_lkp);
        }
        vremfree(vp);
        vp->v_usecount = 1;
        vclean(vp, DOCLOSE);
        vrelel(vp, 0);
        return (1);
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
        vnode_t *vq, **vpp;
        enum vtype type;
        dev_t dev;

        KASSERT(vp->v_usecount > 0);

        mutex_enter(&vp->v_interlock);
        if ((vp->v_iflag & VI_CLEAN) != 0) {
                mutex_exit(&vp->v_interlock);
                return;
        } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
                atomic_inc_uint(&vp->v_usecount);
                vclean(vp, DOCLOSE);
                vrelel(vp, 0);
                return;
        } else {
                dev = vp->v_rdev;
                type = vp->v_type;
                mutex_exit(&vp->v_interlock);
        }

        vpp = &specfs_hash[SPECHASH(dev)];
        mutex_enter(&device_lock);
        for (vq = *vpp; vq != NULL;) {
                /* If clean or being cleaned, then ignore it. */
                mutex_enter(&vq->v_interlock);
                if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
                    vq->v_rdev != dev || vq->v_type != type) {
                        mutex_exit(&vq->v_interlock);
                        vq = vq->v_specnext;
                        continue;
                }
                mutex_exit(&device_lock);
                if (vq->v_usecount == 0) {
                        vremfree(vq);
                        vq->v_usecount = 1;
                } else {
                        atomic_inc_uint(&vq->v_usecount);
                }
                vclean(vq, DOCLOSE);
                vrelel(vq, 0);
                mutex_enter(&device_lock);
                vq = *vpp;
        }
        mutex_exit(&device_lock);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

        mutex_enter(&vp->v_interlock);
        vclean(vp, DOCLOSE);
        vrelel(vp, 0);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
        vnode_t *vp;

        if ((vp = bp->b_vp) == NULL)
                return;

        KASSERT(bp->b_objlock == &vp->v_interlock);
        KASSERT(mutex_owned(bp->b_objlock));

        if (--vp->v_numoutput < 0)
                panic("vwakeup: neg numoutput, vp %p", vp);
        if (vp->v_numoutput == 0)
                cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

        KASSERT(mutex_owned(&vp->v_interlock));
        KASSERT(vp->v_usecount != 0);

        while ((vp->v_iflag & flags) != 0)
                cv_wait(&vp->v_cv, &vp->v_interlock);
}

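/*
 * vfs_drainvnodes: reduce the number of allocated vnodes to the given
 * target by recycling vnodes from the free lists.  Returns EBUSY if no
 * clean vnode can be obtained.
 */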
int
vfs_drainvnodes(long target)
{

        while (numvnodes > target) {
                vnode_t *vp;

                mutex_enter(&vnode_free_list_lock);
                vp = getcleanvnode();
                if (vp == NULL) {
                        return EBUSY;
                }
                ungetnewvnode(vp);
        }
        return 0;
}

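/*
 * vpanic: print diagnostic information about the vnode and panic.
 * Only has an effect in DIAGNOSTIC kernels.
 */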
void
vpanic(vnode_t *vp, const char *msg)
{
#ifdef DIAGNOSTIC

        vprint(NULL, vp);
        panic("%s\n", msg);
#endif
}
