vfs_mount.c revision 1.4.2.2 1 /* $NetBSD: vfs_mount.c,v 1.4.2.2 2011/04/21 01:42:11 rmind Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.4.2.2 2011/04/21 01:42:11 rmind Exp $");
71
72 #include <sys/param.h>
73 #include <sys/kernel.h>
74
75 #include <sys/atomic.h>
76 #include <sys/buf.h>
77 #include <sys/conf.h>
78 #include <sys/fcntl.h>
79 #include <sys/filedesc.h>
80 #include <sys/device.h>
81 #include <sys/kauth.h>
82 #include <sys/kmem.h>
83 #include <sys/module.h>
84 #include <sys/mount.h>
85 #include <sys/namei.h>
86 #include <sys/syscallargs.h>
87 #include <sys/sysctl.h>
88 #include <sys/systm.h>
89 #include <sys/vnode.h>
90
91 #include <miscfs/genfs/genfs.h>
92 #include <miscfs/syncfs/syncfs.h>
93 #include <miscfs/specfs/specdev.h>
94
95 /* Root filesystem and device. */
96 vnode_t * rootvnode;
97 struct device * root_device;
98
99 /* Mounted filesystem list. */
100 struct mntlist mountlist;
101 kmutex_t mountlist_lock;
102
103 kmutex_t mntvnode_lock;
104 kmutex_t vfs_list_lock;
105
106 static specificdata_domain_t mount_specificdata_domain;
107 static kmutex_t mntid_lock;
108
109 static kmutex_t mountgen_lock;
110 static uint64_t mountgen;
111
112 void
113 vfs_mount_sysinit(void)
114 {
115
116 CIRCLEQ_INIT(&mountlist);
117 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
118 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
119 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
120
121 mount_specificdata_domain = specificdata_domain_create();
122 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
123 mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
124 mountgen = 0;
125 }
126
127 struct mount *
128 vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
129 {
130 struct mount *mp;
131 int error;
132
133 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
134 if (mp == NULL)
135 return NULL;
136
137 mp->mnt_op = vfsops;
138 mp->mnt_refcnt = 1;
139 TAILQ_INIT(&mp->mnt_vnodelist);
140 rw_init(&mp->mnt_unmounting);
141 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
142 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
143 error = vfs_busy(mp, NULL);
144 KASSERT(error == 0);
145 mp->mnt_vnodecovered = vp;
146 mount_initspecific(mp);
147
148 mutex_enter(&mountgen_lock);
149 mp->mnt_gen = mountgen++;
150 mutex_exit(&mountgen_lock);
151
152 return mp;
153 }
154
155 /*
156 * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
157 * initialize a mount structure for it.
158 *
159 * Devname is usually updated by mount(8) after booting.
160 */
161 int
162 vfs_rootmountalloc(const char *fstypename, const char *devname,
163 struct mount **mpp)
164 {
165 struct vfsops *vfsp = NULL;
166 struct mount *mp;
167
168 mutex_enter(&vfs_list_lock);
169 LIST_FOREACH(vfsp, &vfs_list, vfs_list)
170 if (!strncmp(vfsp->vfs_name, fstypename,
171 sizeof(mp->mnt_stat.f_fstypename)))
172 break;
173 if (vfsp == NULL) {
174 mutex_exit(&vfs_list_lock);
175 return (ENODEV);
176 }
177 vfsp->vfs_refcount++;
178 mutex_exit(&vfs_list_lock);
179
180 if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
181 return ENOMEM;
182 mp->mnt_flag = MNT_RDONLY;
183 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
184 sizeof(mp->mnt_stat.f_fstypename));
185 mp->mnt_stat.f_mntonname[0] = '/';
186 mp->mnt_stat.f_mntonname[1] = '\0';
187 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
188 '\0';
189 (void)copystr(devname, mp->mnt_stat.f_mntfromname,
190 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
191 *mpp = mp;
192 return 0;
193 }
194
195 /*
196 * vfs_getnewfsid: get a new unique fsid.
197 */
198 void
199 vfs_getnewfsid(struct mount *mp)
200 {
201 static u_short xxxfs_mntid;
202 fsid_t tfsid;
203 int mtype;
204
205 mutex_enter(&mntid_lock);
206 mtype = makefstype(mp->mnt_op->vfs_name);
207 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
208 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
209 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
210 if (xxxfs_mntid == 0)
211 ++xxxfs_mntid;
212 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
213 tfsid.__fsid_val[1] = mtype;
214 if (!CIRCLEQ_EMPTY(&mountlist)) {
215 while (vfs_getvfs(&tfsid)) {
216 tfsid.__fsid_val[0]++;
217 xxxfs_mntid++;
218 }
219 }
220 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
221 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
222 mutex_exit(&mntid_lock);
223 }
224
225 /*
226 * Lookup a mount point by filesystem identifier.
227 *
228 * XXX Needs to add a reference to the mount point.
229 */
230 struct mount *
231 vfs_getvfs(fsid_t *fsid)
232 {
233 struct mount *mp;
234
235 mutex_enter(&mountlist_lock);
236 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
237 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
238 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
239 mutex_exit(&mountlist_lock);
240 return (mp);
241 }
242 }
243 mutex_exit(&mountlist_lock);
244 return NULL;
245 }
246
247 /*
248 * Drop a reference to a mount structure, freeing if the last reference.
249 */
250 void
251 vfs_destroy(struct mount *mp)
252 {
253
254 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
255 return;
256 }
257
258 /*
259 * Nothing else has visibility of the mount: we can now
260 * free the data structures.
261 */
262 KASSERT(mp->mnt_refcnt == 0);
263 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
264 rw_destroy(&mp->mnt_unmounting);
265 mutex_destroy(&mp->mnt_updating);
266 mutex_destroy(&mp->mnt_renamelock);
267 if (mp->mnt_op != NULL) {
268 vfs_delref(mp->mnt_op);
269 }
270 kmem_free(mp, sizeof(*mp));
271 }
272
273 /*
274 * Mark a mount point as busy, and gain a new reference to it. Used to
275 * prevent the file system from being unmounted during critical sections.
276 *
277 * => The caller must hold a pre-existing reference to the mount.
278 * => Will fail if the file system is being unmounted, or is unmounted.
279 */
280 int
281 vfs_busy(struct mount *mp, struct mount **nextp)
282 {
283
284 KASSERT(mp->mnt_refcnt > 0);
285
286 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
287 if (nextp != NULL) {
288 KASSERT(mutex_owned(&mountlist_lock));
289 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
290 }
291 return EBUSY;
292 }
293 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
294 rw_exit(&mp->mnt_unmounting);
295 if (nextp != NULL) {
296 KASSERT(mutex_owned(&mountlist_lock));
297 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
298 }
299 return ENOENT;
300 }
301 if (nextp != NULL) {
302 mutex_exit(&mountlist_lock);
303 }
304 atomic_inc_uint(&mp->mnt_refcnt);
305 return 0;
306 }
307
308 /*
309 * Unbusy a busy filesystem.
310 *
311 * => If keepref is true, preserve reference added by vfs_busy().
312 * => If nextp != NULL, acquire mountlist_lock.
313 */
314 void
315 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
316 {
317
318 KASSERT(mp->mnt_refcnt > 0);
319
320 if (nextp != NULL) {
321 mutex_enter(&mountlist_lock);
322 }
323 rw_exit(&mp->mnt_unmounting);
324 if (!keepref) {
325 vfs_destroy(mp);
326 }
327 if (nextp != NULL) {
328 KASSERT(mutex_owned(&mountlist_lock));
329 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
330 }
331 }
332
333 /*
334 * Insert a marker vnode into a mount's vnode list, after the
335 * specified vnode. mntvnode_lock must be held.
336 */
337 void
338 vmark(vnode_t *mvp, vnode_t *vp)
339 {
340 struct mount *mp = mvp->v_mount;
341
342 KASSERT(mutex_owned(&mntvnode_lock));
343 KASSERT((mvp->v_iflag & VI_MARKER) != 0);
344 KASSERT(vp->v_mount == mp);
345
346 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
347 }
348
349 /*
350 * Remove a marker vnode from a mount's vnode list, and return
351 * a pointer to the next vnode in the list. mntvnode_lock must
352 * be held.
353 */
354 vnode_t *
355 vunmark(vnode_t *mvp)
356 {
357 struct mount *mp = mvp->v_mount;
358 vnode_t *vp;
359
360 KASSERT(mutex_owned(&mntvnode_lock));
361 KASSERT((mvp->v_iflag & VI_MARKER) != 0);
362
363 vp = TAILQ_NEXT(mvp, v_mntvnodes);
364 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
365
366 KASSERT(vp == NULL || vp->v_mount == mp);
367
368 return vp;
369 }
370
371 /*
372 * Move a vnode from one mount queue to another.
373 */
374 void
375 vfs_insmntque(vnode_t *vp, struct mount *mp)
376 {
377 struct mount *omp;
378
379 KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
380 vp->v_tag == VT_VFS);
381
382 mutex_enter(&mntvnode_lock);
383 /*
384 * Delete from old mount point vnode list, if on one.
385 */
386 if ((omp = vp->v_mount) != NULL)
387 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
388 /*
389 * Insert into list of vnodes for the new mount point, if
390 * available. The caller must take a reference on the mount
391 * structure and donate to the vnode.
392 */
393 if ((vp->v_mount = mp) != NULL)
394 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
395 mutex_exit(&mntvnode_lock);
396
397 if (omp != NULL) {
398 /* Release reference to old mount. */
399 vfs_destroy(omp);
400 }
401 }
402
403 /*
404 * Remove any vnodes in the vnode table belonging to mount point mp.
405 *
406 * If FORCECLOSE is not specified, there should not be any active ones,
407 * return error if any are found (nb: this is a user error, not a
408 * system error). If FORCECLOSE is specified, detach any active vnodes
409 * that are found.
410 *
411 * If WRITECLOSE is set, only flush out regular file vnodes open for
412 * writing.
413 *
414 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
415 */
#ifdef DEBUG
/* When non-zero, vflush() prints every busy vnode it encounters. */
int busyprt = 0;
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
420
421 static vnode_t *
422 vflushnext(vnode_t *mvp, int *when)
423 {
424
425 if (hardclock_ticks > *when) {
426 mutex_exit(&mntvnode_lock);
427 yield();
428 mutex_enter(&mntvnode_lock);
429 *when = hardclock_ticks + hz / 10;
430 }
431 return vunmark(mvp);
432 }
433
434 int
435 vflush(struct mount *mp, vnode_t *skipvp, int flags)
436 {
437 vnode_t *vp, *mvp;
438 int busy = 0, when = 0;
439
440 /* First, flush out any vnode references from vrele_list. */
441 vrele_flush();
442
443 /* Allocate a marker vnode. */
444 mvp = vnalloc(mp);
445 if (mvp == NULL) {
446 return ENOMEM;
447 }
448
449 /*
450 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
451 * and vclean() are called.
452 */
453 mutex_enter(&mntvnode_lock);
454 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
455 vp = vflushnext(mvp, &when)) {
456 vmark(mvp, vp);
457 if (vp->v_mount != mp || vismarker(vp))
458 continue;
459 /*
460 * Skip over a selected vnode.
461 */
462 if (vp == skipvp)
463 continue;
464 mutex_enter(vp->v_interlock);
465 /*
466 * Ignore clean but still referenced vnodes.
467 */
468 if ((vp->v_iflag & VI_CLEAN) != 0) {
469 mutex_exit(vp->v_interlock);
470 continue;
471 }
472 /*
473 * Skip over a vnodes marked VSYSTEM.
474 */
475 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
476 mutex_exit(vp->v_interlock);
477 continue;
478 }
479 /*
480 * If WRITECLOSE is set, only flush out regular file
481 * vnodes open for writing.
482 */
483 if ((flags & WRITECLOSE) &&
484 (vp->v_writecount == 0 || vp->v_type != VREG)) {
485 mutex_exit(vp->v_interlock);
486 continue;
487 }
488 /*
489 * With v_usecount == 0, all we need to do is clear
490 * out the vnode data structures and we are done.
491 */
492 if (vp->v_usecount == 0) {
493 mutex_exit(&mntvnode_lock);
494 vremfree(vp);
495 vp->v_usecount = 1;
496 vclean(vp, DOCLOSE);
497 vrelel(vp, 0);
498 mutex_enter(&mntvnode_lock);
499 continue;
500 }
501 /*
502 * If FORCECLOSE is set, forcibly close the vnode.
503 * For block or character devices, revert to an
504 * anonymous device. For all other files, just
505 * kill them.
506 */
507 if (flags & FORCECLOSE) {
508 mutex_exit(&mntvnode_lock);
509 atomic_inc_uint(&vp->v_usecount);
510 if (vp->v_type != VBLK && vp->v_type != VCHR) {
511 vclean(vp, DOCLOSE);
512 vrelel(vp, 0);
513 } else {
514 vclean(vp, 0);
515 vp->v_op = spec_vnodeop_p; /* XXXSMP */
516 mutex_exit(vp->v_interlock);
517 /*
518 * The vnode isn't clean, but still resides
519 * on the mount list. Remove it. XXX This
520 * is a bit dodgy.
521 */
522 vfs_insmntque(vp, NULL);
523 vrele(vp);
524 }
525 mutex_enter(&mntvnode_lock);
526 continue;
527 }
528 #ifdef DEBUG
529 if (busyprt)
530 vprint("vflush: busy vnode", vp);
531 #endif
532 mutex_exit(vp->v_interlock);
533 busy++;
534 }
535 mutex_exit(&mntvnode_lock);
536 vnfree(mvp);
537 if (busy)
538 return (EBUSY);
539 return (0);
540 }
541
542 /*
543 * Remove clean vnodes from a mountpoint's vnode list.
544 */
545 void
546 vfs_scrubvnlist(struct mount *mp)
547 {
548 vnode_t *vp, *nvp;
549
550 retry:
551 mutex_enter(&mntvnode_lock);
552 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
553 nvp = TAILQ_NEXT(vp, v_mntvnodes);
554 mutex_enter(vp->v_interlock);
555 if ((vp->v_iflag & VI_CLEAN) != 0) {
556 TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
557 vp->v_mount = NULL;
558 mutex_exit(&mntvnode_lock);
559 mutex_exit(vp->v_interlock);
560 vfs_destroy(mp);
561 goto retry;
562 }
563 mutex_exit(vp->v_interlock);
564 }
565 mutex_exit(&mntvnode_lock);
566 }
567
568 /*
569 * Mount a file system.
570 */
571
572 /*
573 * Scan all active processes to see if any of them have a current or root
574 * directory onto which the new filesystem has just been mounted. If so,
575 * replace them with the new mount point.
576 */
577 static void
578 mount_checkdirs(vnode_t *olddp)
579 {
580 vnode_t *newdp, *rele1, *rele2;
581 struct cwdinfo *cwdi;
582 struct proc *p;
583 bool retry;
584
585 if (olddp->v_usecount == 1) {
586 return;
587 }
588 if (VFS_ROOT(olddp->v_mountedhere, &newdp))
589 panic("mount: lost mount");
590
591 do {
592 retry = false;
593 mutex_enter(proc_lock);
594 PROCLIST_FOREACH(p, &allproc) {
595 if ((cwdi = p->p_cwdi) == NULL)
596 continue;
597 /*
598 * Cannot change to the old directory any more,
599 * so even if we see a stale value it is not a
600 * problem.
601 */
602 if (cwdi->cwdi_cdir != olddp &&
603 cwdi->cwdi_rdir != olddp)
604 continue;
605 retry = true;
606 rele1 = NULL;
607 rele2 = NULL;
608 atomic_inc_uint(&cwdi->cwdi_refcnt);
609 mutex_exit(proc_lock);
610 rw_enter(&cwdi->cwdi_lock, RW_WRITER);
611 if (cwdi->cwdi_cdir == olddp) {
612 rele1 = cwdi->cwdi_cdir;
613 vref(newdp);
614 cwdi->cwdi_cdir = newdp;
615 }
616 if (cwdi->cwdi_rdir == olddp) {
617 rele2 = cwdi->cwdi_rdir;
618 vref(newdp);
619 cwdi->cwdi_rdir = newdp;
620 }
621 rw_exit(&cwdi->cwdi_lock);
622 cwdfree(cwdi);
623 if (rele1 != NULL)
624 vrele(rele1);
625 if (rele2 != NULL)
626 vrele(rele2);
627 mutex_enter(proc_lock);
628 break;
629 }
630 mutex_exit(proc_lock);
631 } while (retry);
632
633 if (rootvnode == olddp) {
634 vrele(rootvnode);
635 vref(newdp);
636 rootvnode = newdp;
637 }
638 vput(newdp);
639 }
640
641 int
642 mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
643 const char *path, int flags, void *data, size_t *data_len)
644 {
645 vnode_t *vp = *vpp;
646 struct mount *mp;
647 struct vattr va;
648 struct pathbuf *pb;
649 struct nameidata nd;
650 int error;
651
652 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
653 KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
654 if (error) {
655 vfs_delref(vfsops);
656 return error;
657 }
658
659 /* Cannot make a non-dir a mount-point (from here anyway). */
660 if (vp->v_type != VDIR) {
661 vfs_delref(vfsops);
662 return ENOTDIR;
663 }
664
665 /*
666 * If the user is not root, ensure that they own the directory
667 * onto which we are attempting to mount.
668 */
669 if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
670 (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
671 (error = kauth_authorize_generic(l->l_cred,
672 KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
673 vfs_delref(vfsops);
674 return error;
675 }
676
677 if (flags & MNT_EXPORTED) {
678 vfs_delref(vfsops);
679 return EINVAL;
680 }
681
682 if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
683 vfs_delref(vfsops);
684 return ENOMEM;
685 }
686
687 mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
688
689 /*
690 * The underlying file system may refuse the mount for
691 * various reasons. Allow the user to force it to happen.
692 *
693 * Set the mount level flags.
694 */
695 mp->mnt_flag = flags &
696 (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
697 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
698 MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
699 MNT_LOG | MNT_IGNORE | MNT_RDONLY);
700
701 mutex_enter(&mp->mnt_updating);
702 error = VFS_MOUNT(mp, path, data, data_len);
703 mp->mnt_flag &= ~MNT_OP_FLAGS;
704
705 if (error != 0)
706 goto err_unmounted;
707
708 /*
709 * Validate and prepare the mount point.
710 */
711 error = pathbuf_copyin(path, &pb);
712 if (error != 0) {
713 goto err_mounted;
714 }
715 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
716 error = namei(&nd);
717 pathbuf_destroy(pb);
718 if (error != 0) {
719 goto err_mounted;
720 }
721 if (nd.ni_vp != vp) {
722 vput(nd.ni_vp);
723 error = EINVAL;
724 goto err_mounted;
725 }
726 if (vp->v_mountedhere != NULL) {
727 vput(nd.ni_vp);
728 error = EBUSY;
729 goto err_mounted;
730 }
731 error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
732 if (error != 0) {
733 vput(nd.ni_vp);
734 goto err_mounted;
735 }
736
737 /*
738 * Put the new filesystem on the mount list after root.
739 */
740 cache_purge(vp);
741 mp->mnt_iflag &= ~IMNT_WANTRDWR;
742
743 mutex_enter(&mountlist_lock);
744 CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
745 mutex_exit(&mountlist_lock);
746 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
747 error = vfs_allocate_syncvnode(mp);
748 if (error == 0)
749 vp->v_mountedhere = mp;
750 vput(nd.ni_vp);
751 if (error != 0)
752 goto err_onmountlist;
753
754 mount_checkdirs(vp);
755 mutex_exit(&mp->mnt_updating);
756
757 /* Hold an additional reference to the mount across VFS_START(). */
758 vfs_unbusy(mp, true, NULL);
759 (void) VFS_STATVFS(mp, &mp->mnt_stat);
760 error = VFS_START(mp, 0);
761 if (error)
762 vrele(vp);
763 /* Drop reference held for VFS_START(). */
764 vfs_destroy(mp);
765 *vpp = NULL;
766 return error;
767
768 err_onmountlist:
769 mutex_enter(&mountlist_lock);
770 CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
771 mp->mnt_iflag |= IMNT_GONE;
772 mutex_exit(&mountlist_lock);
773
774 err_mounted:
775 if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
776 panic("Unmounting fresh file system failed");
777
778 err_unmounted:
779 vp->v_mountedhere = NULL;
780 mutex_exit(&mp->mnt_updating);
781 vfs_unbusy(mp, false, NULL);
782 vfs_destroy(mp);
783
784 return error;
785 }
786
787 /*
788 * Do the actual file system unmount. File system is assumed to have
789 * been locked by the caller.
790 *
791 * => Caller hold reference to the mount, explicitly for dounmount().
792 */
793 int
794 dounmount(struct mount *mp, int flags, struct lwp *l)
795 {
796 vnode_t *coveredvp;
797 int error, async, used_syncer;
798
799 #if NVERIEXEC > 0
800 error = veriexec_unmountchk(mp);
801 if (error)
802 return (error);
803 #endif /* NVERIEXEC > 0 */
804
805 /*
806 * XXX Freeze syncer. Must do this before locking the
807 * mount point. See dounmount() for details.
808 */
809 mutex_enter(&syncer_mutex);
810 rw_enter(&mp->mnt_unmounting, RW_WRITER);
811 if ((mp->mnt_iflag & IMNT_GONE) != 0) {
812 rw_exit(&mp->mnt_unmounting);
813 mutex_exit(&syncer_mutex);
814 return ENOENT;
815 }
816
817 used_syncer = (mp->mnt_syncer != NULL);
818
819 /*
820 * XXX Syncer must be frozen when we get here. This should really
821 * be done on a per-mountpoint basis, but the syncer doesn't work
822 * like that.
823 *
824 * The caller of dounmount() must acquire syncer_mutex because
825 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
826 * order, and we must preserve that order to avoid deadlock.
827 *
828 * So, if the file system did not use the syncer, now is
829 * the time to release the syncer_mutex.
830 */
831 if (used_syncer == 0) {
832 mutex_exit(&syncer_mutex);
833 }
834 mp->mnt_iflag |= IMNT_UNMOUNT;
835 async = mp->mnt_flag & MNT_ASYNC;
836 mp->mnt_flag &= ~MNT_ASYNC;
837 cache_purgevfs(mp); /* remove cache entries for this file sys */
838 if (mp->mnt_syncer != NULL)
839 vfs_deallocate_syncvnode(mp);
840 error = 0;
841 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
842 error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
843 }
844 vfs_scrubvnlist(mp);
845 if (error == 0 || (flags & MNT_FORCE)) {
846 error = VFS_UNMOUNT(mp, flags);
847 }
848 if (error) {
849 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
850 (void) vfs_allocate_syncvnode(mp);
851 mp->mnt_iflag &= ~IMNT_UNMOUNT;
852 mp->mnt_flag |= async;
853 rw_exit(&mp->mnt_unmounting);
854 if (used_syncer)
855 mutex_exit(&syncer_mutex);
856 return (error);
857 }
858 vfs_scrubvnlist(mp);
859 mutex_enter(&mountlist_lock);
860 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
861 coveredvp->v_mountedhere = NULL;
862 CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
863 mp->mnt_iflag |= IMNT_GONE;
864 mutex_exit(&mountlist_lock);
865 if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
866 panic("unmount: dangling vnode");
867 if (used_syncer)
868 mutex_exit(&syncer_mutex);
869 vfs_hooks_unmount(mp);
870 rw_exit(&mp->mnt_unmounting);
871 vfs_destroy(mp); /* reference from mount() */
872 if (coveredvp != NULLVP) {
873 vrele(coveredvp);
874 }
875 return (0);
876 }
877
878 /*
879 * Unmount all file systems.
880 * We traverse the list in reverse order under the assumption that doing so
881 * will avoid needing to worry about dependencies.
882 */
883 bool
884 vfs_unmountall(struct lwp *l)
885 {
886
887 printf("unmounting file systems...");
888 return vfs_unmountall1(l, true, true);
889 }
890
891 static void
892 vfs_unmount_print(struct mount *mp, const char *pfx)
893 {
894
895 aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
896 mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
897 mp->mnt_stat.f_fstypename);
898 }
899
900 bool
901 vfs_unmount_forceone(struct lwp *l)
902 {
903 struct mount *mp, *nmp;
904 int error;
905
906 nmp = NULL;
907
908 CIRCLEQ_FOREACH_REVERSE(mp, &mountlist, mnt_list) {
909 if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen) {
910 nmp = mp;
911 }
912 }
913 if (nmp == NULL) {
914 return false;
915 }
916
917 #ifdef DEBUG
918 printf("\nforcefully unmounting %s (%s)...",
919 nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
920 #endif
921 atomic_inc_uint(&nmp->mnt_refcnt);
922 if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
923 vfs_unmount_print(nmp, "forcefully ");
924 return true;
925 } else {
926 vfs_destroy(nmp);
927 }
928
929 #ifdef DEBUG
930 printf("forceful unmount of %s failed with error %d\n",
931 nmp->mnt_stat.f_mntonname, error);
932 #endif
933
934 return false;
935 }
936
937 bool
938 vfs_unmountall1(struct lwp *l, bool force, bool verbose)
939 {
940 struct mount *mp, *nmp;
941 bool any_error = false, progress = false;
942 int error;
943
944 for (mp = CIRCLEQ_LAST(&mountlist);
945 mp != (void *)&mountlist;
946 mp = nmp) {
947 nmp = CIRCLEQ_PREV(mp, mnt_list);
948 #ifdef DEBUG
949 printf("\nunmounting %p %s (%s)...",
950 (void *)mp, mp->mnt_stat.f_mntonname,
951 mp->mnt_stat.f_mntfromname);
952 #endif
953 atomic_inc_uint(&mp->mnt_refcnt);
954 if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
955 vfs_unmount_print(mp, "");
956 progress = true;
957 } else {
958 vfs_destroy(mp);
959 if (verbose) {
960 printf("unmount of %s failed with error %d\n",
961 mp->mnt_stat.f_mntonname, error);
962 }
963 any_error = true;
964 }
965 }
966 if (verbose) {
967 printf(" done\n");
968 }
969 if (any_error && verbose) {
970 printf("WARNING: some file systems would not unmount\n");
971 }
972 return progress;
973 }
974
975 void
976 vfs_sync_all(struct lwp *l)
977 {
978 printf("syncing disks... ");
979
980 /* remove user processes from run queue */
981 suspendsched();
982 (void)spl0();
983
984 /* avoid coming back this way again if we panic. */
985 doing_shutdown = 1;
986
987 sys_sync(l, NULL, NULL);
988
989 /* Wait for sync to finish. */
990 if (buf_syncwait() != 0) {
991 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
992 Debugger();
993 #endif
994 printf("giving up\n");
995 return;
996 } else
997 printf("done\n");
998 }
999
1000 /*
1001 * Sync and unmount file systems before shutting down.
1002 */
1003 void
1004 vfs_shutdown(void)
1005 {
1006 lwp_t *l = curlwp;
1007
1008 vfs_sync_all(l);
1009
1010 /*
1011 * If we have paniced - do not make the situation potentially
1012 * worse by unmounting the file systems.
1013 */
1014 if (panicstr != NULL) {
1015 return;
1016 }
1017
1018 /* Unmount file systems. */
1019 vfs_unmountall(l);
1020 }
1021
1022 /*
1023 * Print a list of supported file system types (used by vfs_mountroot)
1024 */
1025 static void
1026 vfs_print_fstypes(void)
1027 {
1028 struct vfsops *v;
1029 int cnt = 0;
1030
1031 mutex_enter(&vfs_list_lock);
1032 LIST_FOREACH(v, &vfs_list, vfs_list)
1033 ++cnt;
1034 mutex_exit(&vfs_list_lock);
1035
1036 if (cnt == 0) {
1037 printf("WARNING: No file system modules have been loaded.\n");
1038 return;
1039 }
1040
1041 printf("Supported file systems:");
1042 mutex_enter(&vfs_list_lock);
1043 LIST_FOREACH(v, &vfs_list, vfs_list) {
1044 printf(" %s", v->vfs_name);
1045 }
1046 mutex_exit(&vfs_list_lock);
1047 printf("\n");
1048 }
1049
1050 /*
1051 * Mount the root file system. If the operator didn't specify a
1052 * file system to use, try all possible file systems until one
1053 * succeeds.
1054 */
1055 int
1056 vfs_mountroot(void)
1057 {
1058 struct vfsops *v;
1059 int error = ENODEV;
1060
1061 if (root_device == NULL)
1062 panic("vfs_mountroot: root device unknown");
1063
1064 switch (device_class(root_device)) {
1065 case DV_IFNET:
1066 if (rootdev != NODEV)
1067 panic("vfs_mountroot: rootdev set for DV_IFNET "
1068 "(0x%llx -> %llu,%llu)",
1069 (unsigned long long)rootdev,
1070 (unsigned long long)major(rootdev),
1071 (unsigned long long)minor(rootdev));
1072 break;
1073
1074 case DV_DISK:
1075 if (rootdev == NODEV)
1076 panic("vfs_mountroot: rootdev not set for DV_DISK");
1077 if (bdevvp(rootdev, &rootvp))
1078 panic("vfs_mountroot: can't get vnode for rootdev");
1079 error = VOP_OPEN(rootvp, FREAD, FSCRED);
1080 if (error) {
1081 printf("vfs_mountroot: can't open root device\n");
1082 return (error);
1083 }
1084 break;
1085
1086 case DV_VIRTUAL:
1087 break;
1088
1089 default:
1090 printf("%s: inappropriate for root file system\n",
1091 device_xname(root_device));
1092 return (ENODEV);
1093 }
1094
1095 /*
1096 * If user specified a root fs type, use it. Make sure the
1097 * specified type exists and has a mount_root()
1098 */
1099 if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
1100 v = vfs_getopsbyname(rootfstype);
1101 error = EFTYPE;
1102 if (v != NULL) {
1103 if (v->vfs_mountroot != NULL) {
1104 error = (v->vfs_mountroot)();
1105 }
1106 v->vfs_refcount--;
1107 }
1108 goto done;
1109 }
1110
1111 /*
1112 * Try each file system currently configured into the kernel.
1113 */
1114 mutex_enter(&vfs_list_lock);
1115 LIST_FOREACH(v, &vfs_list, vfs_list) {
1116 if (v->vfs_mountroot == NULL)
1117 continue;
1118 #ifdef DEBUG
1119 aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1120 #endif
1121 v->vfs_refcount++;
1122 mutex_exit(&vfs_list_lock);
1123 error = (*v->vfs_mountroot)();
1124 mutex_enter(&vfs_list_lock);
1125 v->vfs_refcount--;
1126 if (!error) {
1127 aprint_normal("root file system type: %s\n",
1128 v->vfs_name);
1129 break;
1130 }
1131 }
1132 mutex_exit(&vfs_list_lock);
1133
1134 if (v == NULL) {
1135 vfs_print_fstypes();
1136 printf("no file system for %s", device_xname(root_device));
1137 if (device_class(root_device) == DV_DISK)
1138 printf(" (dev 0x%llx)", (unsigned long long)rootdev);
1139 printf("\n");
1140 error = EFTYPE;
1141 }
1142
1143 done:
1144 if (error && device_class(root_device) == DV_DISK) {
1145 VOP_CLOSE(rootvp, FREAD, FSCRED);
1146 vrele(rootvp);
1147 }
1148 if (error == 0) {
1149 extern struct cwdinfo cwdi0;
1150
1151 CIRCLEQ_FIRST(&mountlist)->mnt_flag |= MNT_ROOTFS;
1152 CIRCLEQ_FIRST(&mountlist)->mnt_op->vfs_refcount++;
1153
1154 /*
1155 * Get the vnode for '/'. Set cwdi0.cwdi_cdir to
1156 * reference it.
1157 */
1158 error = VFS_ROOT(CIRCLEQ_FIRST(&mountlist), &rootvnode);
1159 if (error)
1160 panic("cannot find root vnode, error=%d", error);
1161 cwdi0.cwdi_cdir = rootvnode;
1162 vref(cwdi0.cwdi_cdir);
1163 VOP_UNLOCK(rootvnode);
1164 cwdi0.cwdi_rdir = NULL;
1165
1166 /*
1167 * Now that root is mounted, we can fixup initproc's CWD
1168 * info. All other processes are kthreads, which merely
1169 * share proc0's CWD info.
1170 */
1171 initproc->p_cwdi->cwdi_cdir = rootvnode;
1172 vref(initproc->p_cwdi->cwdi_cdir);
1173 initproc->p_cwdi->cwdi_rdir = NULL;
1174 /*
1175 * Enable loading of modules from the filesystem
1176 */
1177 module_load_vfs_init();
1178
1179 }
1180 return (error);
1181 }
1182
1183 /*
1184 * mount_specific_key_create --
1185 * Create a key for subsystem mount-specific data.
1186 */
1187 int
1188 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1189 {
1190
1191 return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1192 }
1193
1194 /*
1195 * mount_specific_key_delete --
1196 * Delete a key for subsystem mount-specific data.
1197 */
1198 void
1199 mount_specific_key_delete(specificdata_key_t key)
1200 {
1201
1202 specificdata_key_delete(mount_specificdata_domain, key);
1203 }
1204
1205 /*
1206 * mount_initspecific --
1207 * Initialize a mount's specificdata container.
1208 */
1209 void
1210 mount_initspecific(struct mount *mp)
1211 {
1212 int error;
1213
1214 error = specificdata_init(mount_specificdata_domain,
1215 &mp->mnt_specdataref);
1216 KASSERT(error == 0);
1217 }
1218
1219 /*
1220 * mount_finispecific --
1221 * Finalize a mount's specificdata container.
1222 */
1223 void
1224 mount_finispecific(struct mount *mp)
1225 {
1226
1227 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
1228 }
1229
1230 /*
1231 * mount_getspecific --
1232 * Return mount-specific data corresponding to the specified key.
1233 */
1234 void *
1235 mount_getspecific(struct mount *mp, specificdata_key_t key)
1236 {
1237
1238 return specificdata_getspecific(mount_specificdata_domain,
1239 &mp->mnt_specdataref, key);
1240 }
1241
1242 /*
1243 * mount_setspecific --
1244 * Set mount-specific data corresponding to the specified key.
1245 */
1246 void
1247 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
1248 {
1249
1250 specificdata_setspecific(mount_specificdata_domain,
1251 &mp->mnt_specdataref, key, data);
1252 }
1253
1254 /*
1255 * Check to see if a filesystem is mounted on a block device.
1256 */
1257 int
1258 vfs_mountedon(vnode_t *vp)
1259 {
1260 vnode_t *vq;
1261 int error = 0;
1262
1263 if (vp->v_type != VBLK)
1264 return ENOTBLK;
1265 if (vp->v_specmountpoint != NULL)
1266 return (EBUSY);
1267 mutex_enter(&device_lock);
1268 for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
1269 vq = vq->v_specnext) {
1270 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1271 continue;
1272 if (vq->v_specmountpoint != NULL) {
1273 error = EBUSY;
1274 break;
1275 }
1276 }
1277 mutex_exit(&device_lock);
1278 return (error);
1279 }
1280
1281 /*
1282 * Check if a device pointed to by vp is mounted.
1283 *
1284 * Returns:
1285 * EINVAL if it's not a disk
1286 * EBUSY if it's a disk and mounted
1287 * 0 if it's a disk and not mounted
1288 */
1289 int
1290 rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1291 {
1292 vnode_t *bvp;
1293 dev_t dev;
1294 int d_type;
1295
1296 bvp = NULL;
1297 dev = vp->v_rdev;
1298 d_type = D_OTHER;
1299
1300 if (iskmemvp(vp))
1301 return EINVAL;
1302
1303 switch (vp->v_type) {
1304 case VCHR: {
1305 const struct cdevsw *cdev;
1306
1307 cdev = cdevsw_lookup(dev);
1308 if (cdev != NULL) {
1309 dev_t blkdev;
1310
1311 blkdev = devsw_chr2blk(dev);
1312 if (blkdev != NODEV) {
1313 if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1314 d_type = (cdev->d_flag & D_TYPEMASK);
1315 /* XXX: what if bvp disappears? */
1316 vrele(bvp);
1317 }
1318 }
1319 }
1320
1321 break;
1322 }
1323
1324 case VBLK: {
1325 const struct bdevsw *bdev;
1326
1327 bdev = bdevsw_lookup(dev);
1328 if (bdev != NULL)
1329 d_type = (bdev->d_flag & D_TYPEMASK);
1330
1331 bvp = vp;
1332
1333 break;
1334 }
1335
1336 default:
1337 break;
1338 }
1339
1340 if (d_type != D_DISK)
1341 return EINVAL;
1342
1343 if (bvpp != NULL)
1344 *bvpp = bvp;
1345
1346 /*
1347 * XXX: This is bogus. We should be failing the request
1348 * XXX: not only if this specific slice is mounted, but
1349 * XXX: if it's on a disk with any other mounted slice.
1350 */
1351 if (vfs_mountedon(bvp))
1352 return EBUSY;
1353
1354 return 0;
1355 }
1356
/*
 * Make a 'unique' number from a mount type name.
 *
 * Every character of the NUL-terminated string `type' is folded into an
 * accumulator by shifting the accumulator left by two bits and XOR-ing
 * the character in.  An empty string yields 0.
 *
 * The accumulator is unsigned so that bits shifted out of the top are
 * simply discarded; shifting a signed value that overflows or is
 * negative is undefined behavior in C.  Characters are widened through
 * `long' first so that the bits XOR-ed in are the same as when the
 * computation was done in a signed long.
 */
long
makefstype(const char *type)
{
	unsigned long acc = 0;

	for (; *type != '\0'; type++) {
		acc <<= 2;
		acc ^= (unsigned long)(long)*type;
	}
	return (long)acc;
}
1371