vfs_mount.c revision 1.28.2.1 1 /* $NetBSD: vfs_mount.c,v 1.28.2.1 2014/08/10 06:55:58 tls Exp $ */
2
3 /*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.28.2.1 2014/08/10 06:55:58 tls Exp $");
71
72 #define _VFS_VNODE_PRIVATE
73
74 #include <sys/param.h>
75 #include <sys/kernel.h>
76
77 #include <sys/atomic.h>
78 #include <sys/buf.h>
79 #include <sys/conf.h>
80 #include <sys/fcntl.h>
81 #include <sys/filedesc.h>
82 #include <sys/device.h>
83 #include <sys/kauth.h>
84 #include <sys/kmem.h>
85 #include <sys/module.h>
86 #include <sys/mount.h>
87 #include <sys/namei.h>
88 #include <sys/extattr.h>
89 #include <sys/syscallargs.h>
90 #include <sys/sysctl.h>
91 #include <sys/systm.h>
92 #include <sys/vfs_syscalls.h>
93 #include <sys/vnode.h>
94
95 #include <miscfs/genfs/genfs.h>
96 #include <miscfs/syncfs/syncfs.h>
97 #include <miscfs/specfs/specdev.h>
98
99 /* Root filesystem and device. */
100 vnode_t * rootvnode;
101 device_t root_device;
102
103 /* Mounted filesystem list. */
104 struct mntlist mountlist;
105 kmutex_t mountlist_lock;
106
107 kmutex_t mntvnode_lock;
108 kmutex_t vfs_list_lock;
109
110 static specificdata_domain_t mount_specificdata_domain;
111 static kmutex_t mntid_lock;
112
113 static kmutex_t mountgen_lock;
114 static uint64_t mountgen;
115
116 void
117 vfs_mount_sysinit(void)
118 {
119
120 TAILQ_INIT(&mountlist);
121 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
122 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
123 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
124
125 mount_specificdata_domain = specificdata_domain_create();
126 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
127 mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
128 mountgen = 0;
129 }
130
131 struct mount *
132 vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
133 {
134 struct mount *mp;
135 int error __diagused;
136
137 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
138 if (mp == NULL)
139 return NULL;
140
141 mp->mnt_op = vfsops;
142 mp->mnt_refcnt = 1;
143 TAILQ_INIT(&mp->mnt_vnodelist);
144 mutex_init(&mp->mnt_unmounting, MUTEX_DEFAULT, IPL_NONE);
145 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
146 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
147 error = vfs_busy(mp, NULL);
148 KASSERT(error == 0);
149 mp->mnt_vnodecovered = vp;
150 mount_initspecific(mp);
151
152 mutex_enter(&mountgen_lock);
153 mp->mnt_gen = mountgen++;
154 mutex_exit(&mountgen_lock);
155
156 return mp;
157 }
158
159 /*
160 * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
161 * initialize a mount structure for it.
162 *
163 * Devname is usually updated by mount(8) after booting.
164 */
165 int
166 vfs_rootmountalloc(const char *fstypename, const char *devname,
167 struct mount **mpp)
168 {
169 struct vfsops *vfsp = NULL;
170 struct mount *mp;
171
172 mutex_enter(&vfs_list_lock);
173 LIST_FOREACH(vfsp, &vfs_list, vfs_list)
174 if (!strncmp(vfsp->vfs_name, fstypename,
175 sizeof(mp->mnt_stat.f_fstypename)))
176 break;
177 if (vfsp == NULL) {
178 mutex_exit(&vfs_list_lock);
179 return (ENODEV);
180 }
181 vfsp->vfs_refcount++;
182 mutex_exit(&vfs_list_lock);
183
184 if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
185 return ENOMEM;
186 mp->mnt_flag = MNT_RDONLY;
187 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
188 sizeof(mp->mnt_stat.f_fstypename));
189 mp->mnt_stat.f_mntonname[0] = '/';
190 mp->mnt_stat.f_mntonname[1] = '\0';
191 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
192 '\0';
193 (void)copystr(devname, mp->mnt_stat.f_mntfromname,
194 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
195 *mpp = mp;
196 return 0;
197 }
198
199 /*
200 * vfs_getnewfsid: get a new unique fsid.
201 */
202 void
203 vfs_getnewfsid(struct mount *mp)
204 {
205 static u_short xxxfs_mntid;
206 fsid_t tfsid;
207 int mtype;
208
209 mutex_enter(&mntid_lock);
210 mtype = makefstype(mp->mnt_op->vfs_name);
211 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
212 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
213 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
214 if (xxxfs_mntid == 0)
215 ++xxxfs_mntid;
216 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
217 tfsid.__fsid_val[1] = mtype;
218 if (!TAILQ_EMPTY(&mountlist)) {
219 while (vfs_getvfs(&tfsid)) {
220 tfsid.__fsid_val[0]++;
221 xxxfs_mntid++;
222 }
223 }
224 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
225 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
226 mutex_exit(&mntid_lock);
227 }
228
229 /*
230 * Lookup a mount point by filesystem identifier.
231 *
232 * XXX Needs to add a reference to the mount point.
233 */
234 struct mount *
235 vfs_getvfs(fsid_t *fsid)
236 {
237 struct mount *mp;
238
239 mutex_enter(&mountlist_lock);
240 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
241 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
242 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
243 mutex_exit(&mountlist_lock);
244 return (mp);
245 }
246 }
247 mutex_exit(&mountlist_lock);
248 return NULL;
249 }
250
251 /*
252 * Drop a reference to a mount structure, freeing if the last reference.
253 */
254 void
255 vfs_destroy(struct mount *mp)
256 {
257
258 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
259 return;
260 }
261
262 /*
263 * Nothing else has visibility of the mount: we can now
264 * free the data structures.
265 */
266 KASSERT(mp->mnt_refcnt == 0);
267 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
268 mutex_destroy(&mp->mnt_unmounting);
269 mutex_destroy(&mp->mnt_updating);
270 mutex_destroy(&mp->mnt_renamelock);
271 if (mp->mnt_op != NULL) {
272 vfs_delref(mp->mnt_op);
273 }
274 kmem_free(mp, sizeof(*mp));
275 }
276
277 /*
278 * Mark a mount point as busy, and gain a new reference to it. Used to
279 * prevent the file system from being unmounted during critical sections.
280 *
281 * vfs_busy can be called multiple times and by multiple threads
282 * and must be accompanied by the same number of vfs_unbusy calls.
283 *
284 * => The caller must hold a pre-existing reference to the mount.
285 * => Will fail if the file system is being unmounted, or is unmounted.
286 */
287 int
288 vfs_busy(struct mount *mp, struct mount **nextp)
289 {
290
291 KASSERT(mp->mnt_refcnt > 0);
292
293 mutex_enter(&mp->mnt_unmounting);
294 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
295 mutex_exit(&mp->mnt_unmounting);
296 if (nextp != NULL) {
297 KASSERT(mutex_owned(&mountlist_lock));
298 *nextp = TAILQ_NEXT(mp, mnt_list);
299 }
300 return ENOENT;
301 }
302 ++mp->mnt_busynest;
303 KASSERT(mp->mnt_busynest != 0);
304 mutex_exit(&mp->mnt_unmounting);
305 if (nextp != NULL) {
306 mutex_exit(&mountlist_lock);
307 }
308 atomic_inc_uint(&mp->mnt_refcnt);
309 return 0;
310 }
311
312 /*
313 * Unbusy a busy filesystem.
314 *
315 * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
316 *
317 * => If keepref is true, preserve reference added by vfs_busy().
318 * => If nextp != NULL, acquire mountlist_lock.
319 */
320 void
321 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
322 {
323
324 KASSERT(mp->mnt_refcnt > 0);
325
326 if (nextp != NULL) {
327 mutex_enter(&mountlist_lock);
328 }
329 mutex_enter(&mp->mnt_unmounting);
330 KASSERT(mp->mnt_busynest != 0);
331 mp->mnt_busynest--;
332 mutex_exit(&mp->mnt_unmounting);
333 if (!keepref) {
334 vfs_destroy(mp);
335 }
336 if (nextp != NULL) {
337 KASSERT(mutex_owned(&mountlist_lock));
338 *nextp = TAILQ_NEXT(mp, mnt_list);
339 }
340 }
341
342 struct vnode_iterator {
343 struct vnode vi_vnode;
344 };
345
346 void
347 vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vip)
348 {
349 struct vnode *vp;
350
351 vp = vnalloc(mp);
352
353 mutex_enter(&mntvnode_lock);
354 TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
355 vp->v_usecount = 1;
356 mutex_exit(&mntvnode_lock);
357
358 *vip = (struct vnode_iterator *)vp;
359 }
360
361 void
362 vfs_vnode_iterator_destroy(struct vnode_iterator *vi)
363 {
364 struct vnode *mvp = &vi->vi_vnode;
365
366 mutex_enter(&mntvnode_lock);
367 KASSERT(ISSET(mvp->v_iflag, VI_MARKER));
368 if (mvp->v_usecount != 0)
369 TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvp, v_mntvnodes);
370 mutex_exit(&mntvnode_lock);
371 vnfree(mvp);
372 }
373
374 struct vnode *
375 vfs_vnode_iterator_next(struct vnode_iterator *vi,
376 bool (*f)(void *, struct vnode *), void *cl)
377 {
378 struct vnode *mvp = &vi->vi_vnode;
379 struct mount *mp = mvp->v_mount;
380 struct vnode *vp;
381 int error;
382
383 KASSERT(ISSET(mvp->v_iflag, VI_MARKER));
384
385 do {
386 mutex_enter(&mntvnode_lock);
387 vp = TAILQ_NEXT(mvp, v_mntvnodes);
388 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
389 mvp->v_usecount = 0;
390 again:
391 if (vp == NULL) {
392 mutex_exit(&mntvnode_lock);
393 return NULL;
394 }
395 mutex_enter(vp->v_interlock);
396 if (ISSET(vp->v_iflag, VI_MARKER) ||
397 (f && !ISSET(vp->v_iflag, VI_XLOCK) && !(*f)(cl, vp))) {
398 mutex_exit(vp->v_interlock);
399 vp = TAILQ_NEXT(vp, v_mntvnodes);
400 goto again;
401 }
402
403 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
404 mvp->v_usecount = 1;
405 mutex_exit(&mntvnode_lock);
406 error = vget(vp, 0);
407 KASSERT(error == 0 || error == ENOENT);
408 } while (error != 0);
409
410 return vp;
411 }
412
413 /*
414 * Move a vnode from one mount queue to another.
415 */
416 void
417 vfs_insmntque(vnode_t *vp, struct mount *mp)
418 {
419 struct mount *omp;
420
421 KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
422 vp->v_tag == VT_VFS);
423
424 mutex_enter(&mntvnode_lock);
425 /*
426 * Delete from old mount point vnode list, if on one.
427 */
428 if ((omp = vp->v_mount) != NULL)
429 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
430 /*
431 * Insert into list of vnodes for the new mount point, if
432 * available. The caller must take a reference on the mount
433 * structure and donate to the vnode.
434 */
435 if ((vp->v_mount = mp) != NULL)
436 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
437 mutex_exit(&mntvnode_lock);
438
439 if (omp != NULL) {
440 /* Release reference to old mount. */
441 vfs_destroy(omp);
442 }
443 }
444
445 /*
446 * Remove any vnodes in the vnode table belonging to mount point mp.
447 *
448 * If FORCECLOSE is not specified, there should not be any active ones,
449 * return error if any are found (nb: this is a user error, not a
450 * system error). If FORCECLOSE is specified, detach any active vnodes
451 * that are found.
452 *
453 * If WRITECLOSE is set, only flush out regular file vnodes open for
454 * writing.
455 *
456 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
457 */
#ifdef DEBUG
/* When non-zero, vflush() prints every busy vnode it encounters. */
int busyprt = 0;
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

/* Arguments passed from vflush() to vflush_selector(). */
struct vflush_ctx {
	const struct vnode *skipvp;	/* vnode to leave alone */
	int flags;			/* vflush() flags */
};
467
468 static bool
469 vflush_selector(void *cl, struct vnode *vp)
470 {
471 struct vflush_ctx *c = cl;
472 /*
473 * Skip over a selected vnode.
474 */
475 if (vp == c->skipvp)
476 return false;
477 /*
478 * Skip over a vnodes marked VSYSTEM.
479 */
480 if ((c->flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))
481 return false;
482
483 /*
484 * If WRITECLOSE is set, only flush out regular file
485 * vnodes open for writing.
486 */
487 if ((c->flags & WRITECLOSE) && vp->v_type == VREG) {
488 if (vp->v_writecount == 0)
489 return false;
490 }
491 return true;
492 }
493
494 static vnode_t *
495 vflushnext(struct vnode_iterator *marker, void *ctx, int *when)
496 {
497 if (hardclock_ticks > *when) {
498 yield();
499 *when = hardclock_ticks + hz / 10;
500 }
501 return vfs_vnode_iterator_next(marker, vflush_selector, ctx);
502 }
503
504
505 int
506 vflush(struct mount *mp, vnode_t *skipvp, int flags)
507 {
508 vnode_t *vp;
509 struct vnode_iterator *marker;
510 int busy = 0, when = 0;
511 struct vflush_ctx ctx;
512
513 /* First, flush out any vnode references from vrele_list. */
514 vrele_flush();
515
516 vfs_vnode_iterator_init(mp, &marker);
517
518 ctx.skipvp = skipvp;
519 ctx.flags = flags;
520 while ((vp = vflushnext(marker, &ctx, &when)) != NULL) {
521 /*
522 * First try to recycle the vnode.
523 */
524 if (vrecycle(vp))
525 continue;
526 /*
527 * If FORCECLOSE is set, forcibly close the vnode.
528 */
529 if (flags & FORCECLOSE) {
530 vgone(vp);
531 continue;
532 }
533 #ifdef DEBUG
534 if (busyprt)
535 vprint("vflush: busy vnode", vp);
536 #endif
537 vrele(vp);
538 busy++;
539 }
540 vfs_vnode_iterator_destroy(marker);
541 if (busy)
542 return (EBUSY);
543 return (0);
544 }
545
546 /*
547 * Mount a file system.
548 */
549
550 /*
551 * Scan all active processes to see if any of them have a current or root
552 * directory onto which the new filesystem has just been mounted. If so,
553 * replace them with the new mount point.
554 */
555 static void
556 mount_checkdirs(vnode_t *olddp)
557 {
558 vnode_t *newdp, *rele1, *rele2;
559 struct cwdinfo *cwdi;
560 struct proc *p;
561 bool retry;
562
563 if (olddp->v_usecount == 1) {
564 return;
565 }
566 if (VFS_ROOT(olddp->v_mountedhere, &newdp))
567 panic("mount: lost mount");
568
569 do {
570 retry = false;
571 mutex_enter(proc_lock);
572 PROCLIST_FOREACH(p, &allproc) {
573 if ((cwdi = p->p_cwdi) == NULL)
574 continue;
575 /*
576 * Cannot change to the old directory any more,
577 * so even if we see a stale value it is not a
578 * problem.
579 */
580 if (cwdi->cwdi_cdir != olddp &&
581 cwdi->cwdi_rdir != olddp)
582 continue;
583 retry = true;
584 rele1 = NULL;
585 rele2 = NULL;
586 atomic_inc_uint(&cwdi->cwdi_refcnt);
587 mutex_exit(proc_lock);
588 rw_enter(&cwdi->cwdi_lock, RW_WRITER);
589 if (cwdi->cwdi_cdir == olddp) {
590 rele1 = cwdi->cwdi_cdir;
591 vref(newdp);
592 cwdi->cwdi_cdir = newdp;
593 }
594 if (cwdi->cwdi_rdir == olddp) {
595 rele2 = cwdi->cwdi_rdir;
596 vref(newdp);
597 cwdi->cwdi_rdir = newdp;
598 }
599 rw_exit(&cwdi->cwdi_lock);
600 cwdfree(cwdi);
601 if (rele1 != NULL)
602 vrele(rele1);
603 if (rele2 != NULL)
604 vrele(rele2);
605 mutex_enter(proc_lock);
606 break;
607 }
608 mutex_exit(proc_lock);
609 } while (retry);
610
611 if (rootvnode == olddp) {
612 vrele(rootvnode);
613 vref(newdp);
614 rootvnode = newdp;
615 }
616 vput(newdp);
617 }
618
619 int
620 mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
621 const char *path, int flags, void *data, size_t *data_len)
622 {
623 vnode_t *vp = *vpp;
624 struct mount *mp;
625 struct pathbuf *pb;
626 struct nameidata nd;
627 int error;
628
629 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
630 KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
631 if (error) {
632 vfs_delref(vfsops);
633 return error;
634 }
635
636 /* Cannot make a non-dir a mount-point (from here anyway). */
637 if (vp->v_type != VDIR) {
638 vfs_delref(vfsops);
639 return ENOTDIR;
640 }
641
642 if (flags & MNT_EXPORTED) {
643 vfs_delref(vfsops);
644 return EINVAL;
645 }
646
647 if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
648 vfs_delref(vfsops);
649 return ENOMEM;
650 }
651
652 mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
653
654 /*
655 * The underlying file system may refuse the mount for
656 * various reasons. Allow the user to force it to happen.
657 *
658 * Set the mount level flags.
659 */
660 mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
661
662 mutex_enter(&mp->mnt_updating);
663 error = VFS_MOUNT(mp, path, data, data_len);
664 mp->mnt_flag &= ~MNT_OP_FLAGS;
665
666 if (error != 0)
667 goto err_unmounted;
668
669 /*
670 * Validate and prepare the mount point.
671 */
672 error = pathbuf_copyin(path, &pb);
673 if (error != 0) {
674 goto err_mounted;
675 }
676 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
677 error = namei(&nd);
678 pathbuf_destroy(pb);
679 if (error != 0) {
680 goto err_mounted;
681 }
682 if (nd.ni_vp != vp) {
683 vput(nd.ni_vp);
684 error = EINVAL;
685 goto err_mounted;
686 }
687 if (vp->v_mountedhere != NULL) {
688 vput(nd.ni_vp);
689 error = EBUSY;
690 goto err_mounted;
691 }
692 error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
693 if (error != 0) {
694 vput(nd.ni_vp);
695 goto err_mounted;
696 }
697
698 /*
699 * Put the new filesystem on the mount list after root.
700 */
701 cache_purge(vp);
702 mp->mnt_iflag &= ~IMNT_WANTRDWR;
703
704 mutex_enter(&mountlist_lock);
705 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
706 mutex_exit(&mountlist_lock);
707 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
708 error = vfs_allocate_syncvnode(mp);
709 if (error == 0)
710 vp->v_mountedhere = mp;
711 vput(nd.ni_vp);
712 if (error != 0)
713 goto err_onmountlist;
714
715 mount_checkdirs(vp);
716 mutex_exit(&mp->mnt_updating);
717
718 /* Hold an additional reference to the mount across VFS_START(). */
719 vfs_unbusy(mp, true, NULL);
720 (void) VFS_STATVFS(mp, &mp->mnt_stat);
721 error = VFS_START(mp, 0);
722 if (error) {
723 vrele(vp);
724 } else if (flags & MNT_EXTATTR) {
725 error = VFS_EXTATTRCTL(vp->v_mountedhere,
726 EXTATTR_CMD_START, NULL, 0, NULL);
727 if (error)
728 printf("%s: failed to start extattr: error = %d\n",
729 vp->v_mountedhere->mnt_stat.f_mntonname, error);
730 }
731 /* Drop reference held for VFS_START(). */
732 vfs_destroy(mp);
733 *vpp = NULL;
734 return error;
735
736 err_onmountlist:
737 mutex_enter(&mountlist_lock);
738 TAILQ_REMOVE(&mountlist, mp, mnt_list);
739 mp->mnt_iflag |= IMNT_GONE;
740 mutex_exit(&mountlist_lock);
741
742 err_mounted:
743 if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
744 panic("Unmounting fresh file system failed");
745
746 err_unmounted:
747 vp->v_mountedhere = NULL;
748 mutex_exit(&mp->mnt_updating);
749 vfs_unbusy(mp, false, NULL);
750 vfs_destroy(mp);
751
752 return error;
753 }
754
755 /*
756 * Do the actual file system unmount. File system is assumed to have
757 * been locked by the caller.
758 *
759 * => Caller hold reference to the mount, explicitly for dounmount().
760 */
761 int
762 dounmount(struct mount *mp, int flags, struct lwp *l)
763 {
764 vnode_t *coveredvp;
765 int error, async, used_syncer;
766
767 #if NVERIEXEC > 0
768 error = veriexec_unmountchk(mp);
769 if (error)
770 return (error);
771 #endif /* NVERIEXEC > 0 */
772
773 /*
774 * XXX Freeze syncer. Must do this before locking the
775 * mount point. See dounmount() for details.
776 */
777 mutex_enter(&syncer_mutex);
778
779 /*
780 * Abort unmount attempt when the filesystem is in use
781 */
782 mutex_enter(&mp->mnt_unmounting);
783 if (mp->mnt_busynest != 0) {
784 mutex_exit(&mp->mnt_unmounting);
785 mutex_exit(&syncer_mutex);
786 return EBUSY;
787 }
788
789 /*
790 * Abort unmount attempt when the filesystem is not mounted
791 */
792 if ((mp->mnt_iflag & IMNT_GONE) != 0) {
793 mutex_exit(&mp->mnt_unmounting);
794 mutex_exit(&syncer_mutex);
795 return ENOENT;
796 }
797
798 used_syncer = (mp->mnt_syncer != NULL);
799
800 /*
801 * XXX Syncer must be frozen when we get here. This should really
802 * be done on a per-mountpoint basis, but the syncer doesn't work
803 * like that.
804 *
805 * The caller of dounmount() must acquire syncer_mutex because
806 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
807 * order, and we must preserve that order to avoid deadlock.
808 *
809 * So, if the file system did not use the syncer, now is
810 * the time to release the syncer_mutex.
811 */
812 if (used_syncer == 0) {
813 mutex_exit(&syncer_mutex);
814 }
815 mp->mnt_iflag |= IMNT_UNMOUNT;
816 mutex_enter(&mp->mnt_updating);
817 async = mp->mnt_flag & MNT_ASYNC;
818 mp->mnt_flag &= ~MNT_ASYNC;
819 cache_purgevfs(mp); /* remove cache entries for this file sys */
820 if (mp->mnt_syncer != NULL)
821 vfs_deallocate_syncvnode(mp);
822 error = 0;
823 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
824 error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
825 }
826 if (error == 0 || (flags & MNT_FORCE)) {
827 error = VFS_UNMOUNT(mp, flags);
828 }
829 if (error) {
830 mp->mnt_iflag &= ~IMNT_UNMOUNT;
831 mutex_exit(&mp->mnt_unmounting);
832 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
833 (void) vfs_allocate_syncvnode(mp);
834 mp->mnt_flag |= async;
835 mutex_exit(&mp->mnt_updating);
836 if (used_syncer)
837 mutex_exit(&syncer_mutex);
838 return (error);
839 }
840 mutex_exit(&mp->mnt_updating);
841
842 /*
843 * release mnt_umounting lock here, because other code calls
844 * vfs_busy() while holding the mountlist_lock.
845 *
846 * mark filesystem as gone to prevent further umounts
847 * after mnt_umounting lock is gone, this also prevents
848 * vfs_busy() from succeeding.
849 */
850 mp->mnt_iflag |= IMNT_GONE;
851 mutex_exit(&mp->mnt_unmounting);
852
853 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
854 vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY);
855 coveredvp->v_mountedhere = NULL;
856 VOP_UNLOCK(coveredvp);
857 }
858 mutex_enter(&mountlist_lock);
859 TAILQ_REMOVE(&mountlist, mp, mnt_list);
860 mutex_exit(&mountlist_lock);
861 if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
862 panic("unmount: dangling vnode");
863 if (used_syncer)
864 mutex_exit(&syncer_mutex);
865 vfs_hooks_unmount(mp);
866
867 vfs_destroy(mp); /* reference from mount() */
868 if (coveredvp != NULLVP) {
869 vrele(coveredvp);
870 }
871 return (0);
872 }
873
874 /*
875 * Unmount all file systems.
876 * We traverse the list in reverse order under the assumption that doing so
877 * will avoid needing to worry about dependencies.
878 */
879 bool
880 vfs_unmountall(struct lwp *l)
881 {
882
883 printf("unmounting file systems...\n");
884 return vfs_unmountall1(l, true, true);
885 }
886
887 static void
888 vfs_unmount_print(struct mount *mp, const char *pfx)
889 {
890
891 aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
892 mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
893 mp->mnt_stat.f_fstypename);
894 }
895
896 bool
897 vfs_unmount_forceone(struct lwp *l)
898 {
899 struct mount *mp, *nmp;
900 int error;
901
902 nmp = NULL;
903
904 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
905 if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen) {
906 nmp = mp;
907 }
908 }
909 if (nmp == NULL) {
910 return false;
911 }
912
913 #ifdef DEBUG
914 printf("forcefully unmounting %s (%s)...\n",
915 nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
916 #endif
917 atomic_inc_uint(&nmp->mnt_refcnt);
918 if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
919 vfs_unmount_print(nmp, "forcefully ");
920 return true;
921 } else {
922 vfs_destroy(nmp);
923 }
924
925 #ifdef DEBUG
926 printf("forceful unmount of %s failed with error %d\n",
927 nmp->mnt_stat.f_mntonname, error);
928 #endif
929
930 return false;
931 }
932
933 bool
934 vfs_unmountall1(struct lwp *l, bool force, bool verbose)
935 {
936 struct mount *mp, *nmp;
937 bool any_error = false, progress = false;
938 int error;
939
940 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
941 #ifdef DEBUG
942 printf("unmounting %p %s (%s)...\n",
943 (void *)mp, mp->mnt_stat.f_mntonname,
944 mp->mnt_stat.f_mntfromname);
945 #endif
946 atomic_inc_uint(&mp->mnt_refcnt);
947 if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
948 vfs_unmount_print(mp, "");
949 progress = true;
950 } else {
951 vfs_destroy(mp);
952 if (verbose) {
953 printf("unmount of %s failed with error %d\n",
954 mp->mnt_stat.f_mntonname, error);
955 }
956 any_error = true;
957 }
958 }
959 if (verbose) {
960 printf("unmounting done\n");
961 }
962 if (any_error && verbose) {
963 printf("WARNING: some file systems would not unmount\n");
964 }
965 return progress;
966 }
967
968 void
969 vfs_sync_all(struct lwp *l)
970 {
971 printf("syncing disks... ");
972
973 /* remove user processes from run queue */
974 suspendsched();
975 (void)spl0();
976
977 /* avoid coming back this way again if we panic. */
978 doing_shutdown = 1;
979
980 do_sys_sync(l);
981
982 /* Wait for sync to finish. */
983 if (buf_syncwait() != 0) {
984 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
985 Debugger();
986 #endif
987 printf("giving up\n");
988 return;
989 } else
990 printf("done\n");
991 }
992
993 /*
994 * Sync and unmount file systems before shutting down.
995 */
996 void
997 vfs_shutdown(void)
998 {
999 lwp_t *l = curlwp;
1000
1001 vfs_sync_all(l);
1002
1003 /*
1004 * If we have paniced - do not make the situation potentially
1005 * worse by unmounting the file systems.
1006 */
1007 if (panicstr != NULL) {
1008 return;
1009 }
1010
1011 /* Unmount file systems. */
1012 vfs_unmountall(l);
1013 }
1014
1015 /*
1016 * Print a list of supported file system types (used by vfs_mountroot)
1017 */
1018 static void
1019 vfs_print_fstypes(void)
1020 {
1021 struct vfsops *v;
1022 int cnt = 0;
1023
1024 mutex_enter(&vfs_list_lock);
1025 LIST_FOREACH(v, &vfs_list, vfs_list)
1026 ++cnt;
1027 mutex_exit(&vfs_list_lock);
1028
1029 if (cnt == 0) {
1030 printf("WARNING: No file system modules have been loaded.\n");
1031 return;
1032 }
1033
1034 printf("Supported file systems:");
1035 mutex_enter(&vfs_list_lock);
1036 LIST_FOREACH(v, &vfs_list, vfs_list) {
1037 printf(" %s", v->vfs_name);
1038 }
1039 mutex_exit(&vfs_list_lock);
1040 printf("\n");
1041 }
1042
1043 /*
1044 * Mount the root file system. If the operator didn't specify a
1045 * file system to use, try all possible file systems until one
1046 * succeeds.
1047 */
1048 int
1049 vfs_mountroot(void)
1050 {
1051 struct vfsops *v;
1052 int error = ENODEV;
1053
1054 if (root_device == NULL)
1055 panic("vfs_mountroot: root device unknown");
1056
1057 switch (device_class(root_device)) {
1058 case DV_IFNET:
1059 if (rootdev != NODEV)
1060 panic("vfs_mountroot: rootdev set for DV_IFNET "
1061 "(0x%llx -> %llu,%llu)",
1062 (unsigned long long)rootdev,
1063 (unsigned long long)major(rootdev),
1064 (unsigned long long)minor(rootdev));
1065 break;
1066
1067 case DV_DISK:
1068 if (rootdev == NODEV)
1069 panic("vfs_mountroot: rootdev not set for DV_DISK");
1070 if (bdevvp(rootdev, &rootvp))
1071 panic("vfs_mountroot: can't get vnode for rootdev");
1072 error = VOP_OPEN(rootvp, FREAD, FSCRED);
1073 if (error) {
1074 printf("vfs_mountroot: can't open root device\n");
1075 return (error);
1076 }
1077 break;
1078
1079 case DV_VIRTUAL:
1080 break;
1081
1082 default:
1083 printf("%s: inappropriate for root file system\n",
1084 device_xname(root_device));
1085 return (ENODEV);
1086 }
1087
1088 /*
1089 * If user specified a root fs type, use it. Make sure the
1090 * specified type exists and has a mount_root()
1091 */
1092 if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
1093 v = vfs_getopsbyname(rootfstype);
1094 error = EFTYPE;
1095 if (v != NULL) {
1096 if (v->vfs_mountroot != NULL) {
1097 error = (v->vfs_mountroot)();
1098 }
1099 v->vfs_refcount--;
1100 }
1101 goto done;
1102 }
1103
1104 /*
1105 * Try each file system currently configured into the kernel.
1106 */
1107 mutex_enter(&vfs_list_lock);
1108 LIST_FOREACH(v, &vfs_list, vfs_list) {
1109 if (v->vfs_mountroot == NULL)
1110 continue;
1111 #ifdef DEBUG
1112 aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1113 #endif
1114 v->vfs_refcount++;
1115 mutex_exit(&vfs_list_lock);
1116 error = (*v->vfs_mountroot)();
1117 mutex_enter(&vfs_list_lock);
1118 v->vfs_refcount--;
1119 if (!error) {
1120 aprint_normal("root file system type: %s\n",
1121 v->vfs_name);
1122 break;
1123 }
1124 }
1125 mutex_exit(&vfs_list_lock);
1126
1127 if (v == NULL) {
1128 vfs_print_fstypes();
1129 printf("no file system for %s", device_xname(root_device));
1130 if (device_class(root_device) == DV_DISK)
1131 printf(" (dev 0x%llx)", (unsigned long long)rootdev);
1132 printf("\n");
1133 error = EFTYPE;
1134 }
1135
1136 done:
1137 if (error && device_class(root_device) == DV_DISK) {
1138 VOP_CLOSE(rootvp, FREAD, FSCRED);
1139 vrele(rootvp);
1140 }
1141 if (error == 0) {
1142 struct mount *mp;
1143 extern struct cwdinfo cwdi0;
1144
1145 mp = TAILQ_FIRST(&mountlist);
1146 mp->mnt_flag |= MNT_ROOTFS;
1147 mp->mnt_op->vfs_refcount++;
1148
1149 /*
1150 * Get the vnode for '/'. Set cwdi0.cwdi_cdir to
1151 * reference it.
1152 */
1153 error = VFS_ROOT(mp, &rootvnode);
1154 if (error)
1155 panic("cannot find root vnode, error=%d", error);
1156 cwdi0.cwdi_cdir = rootvnode;
1157 vref(cwdi0.cwdi_cdir);
1158 VOP_UNLOCK(rootvnode);
1159 cwdi0.cwdi_rdir = NULL;
1160
1161 /*
1162 * Now that root is mounted, we can fixup initproc's CWD
1163 * info. All other processes are kthreads, which merely
1164 * share proc0's CWD info.
1165 */
1166 initproc->p_cwdi->cwdi_cdir = rootvnode;
1167 vref(initproc->p_cwdi->cwdi_cdir);
1168 initproc->p_cwdi->cwdi_rdir = NULL;
1169 /*
1170 * Enable loading of modules from the filesystem
1171 */
1172 module_load_vfs_init();
1173
1174 }
1175 return (error);
1176 }
1177
1178 /*
1179 * mount_specific_key_create --
1180 * Create a key for subsystem mount-specific data.
1181 */
1182 int
1183 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1184 {
1185
1186 return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1187 }
1188
1189 /*
1190 * mount_specific_key_delete --
1191 * Delete a key for subsystem mount-specific data.
1192 */
1193 void
1194 mount_specific_key_delete(specificdata_key_t key)
1195 {
1196
1197 specificdata_key_delete(mount_specificdata_domain, key);
1198 }
1199
1200 /*
1201 * mount_initspecific --
1202 * Initialize a mount's specificdata container.
1203 */
1204 void
1205 mount_initspecific(struct mount *mp)
1206 {
1207 int error __diagused;
1208
1209 error = specificdata_init(mount_specificdata_domain,
1210 &mp->mnt_specdataref);
1211 KASSERT(error == 0);
1212 }
1213
1214 /*
1215 * mount_finispecific --
1216 * Finalize a mount's specificdata container.
1217 */
1218 void
1219 mount_finispecific(struct mount *mp)
1220 {
1221
1222 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
1223 }
1224
1225 /*
1226 * mount_getspecific --
1227 * Return mount-specific data corresponding to the specified key.
1228 */
1229 void *
1230 mount_getspecific(struct mount *mp, specificdata_key_t key)
1231 {
1232
1233 return specificdata_getspecific(mount_specificdata_domain,
1234 &mp->mnt_specdataref, key);
1235 }
1236
1237 /*
1238 * mount_setspecific --
1239 * Set mount-specific data corresponding to the specified key.
1240 */
1241 void
1242 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
1243 {
1244
1245 specificdata_setspecific(mount_specificdata_domain,
1246 &mp->mnt_specdataref, key, data);
1247 }
1248
1249 /*
1250 * Check to see if a filesystem is mounted on a block device.
1251 */
1252 int
1253 vfs_mountedon(vnode_t *vp)
1254 {
1255 vnode_t *vq;
1256 int error = 0;
1257
1258 if (vp->v_type != VBLK)
1259 return ENOTBLK;
1260 if (spec_node_getmountedfs(vp) != NULL)
1261 return EBUSY;
1262 if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) {
1263 if (spec_node_getmountedfs(vq) != NULL)
1264 error = EBUSY;
1265 vrele(vq);
1266 }
1267
1268 return error;
1269 }
1270
1271 /*
1272 * Check if a device pointed to by vp is mounted.
1273 *
1274 * Returns:
1275 * EINVAL if it's not a disk
1276 * EBUSY if it's a disk and mounted
1277 * 0 if it's a disk and not mounted
1278 */
1279 int
1280 rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1281 {
1282 vnode_t *bvp;
1283 dev_t dev;
1284 int d_type;
1285
1286 bvp = NULL;
1287 d_type = D_OTHER;
1288
1289 if (iskmemvp(vp))
1290 return EINVAL;
1291
1292 switch (vp->v_type) {
1293 case VCHR: {
1294 const struct cdevsw *cdev;
1295
1296 dev = vp->v_rdev;
1297 cdev = cdevsw_lookup(dev);
1298 if (cdev != NULL) {
1299 dev_t blkdev;
1300
1301 blkdev = devsw_chr2blk(dev);
1302 if (blkdev != NODEV) {
1303 if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1304 d_type = (cdev->d_flag & D_TYPEMASK);
1305 /* XXX: what if bvp disappears? */
1306 vrele(bvp);
1307 }
1308 }
1309 }
1310
1311 break;
1312 }
1313
1314 case VBLK: {
1315 const struct bdevsw *bdev;
1316
1317 dev = vp->v_rdev;
1318 bdev = bdevsw_lookup(dev);
1319 if (bdev != NULL)
1320 d_type = (bdev->d_flag & D_TYPEMASK);
1321
1322 bvp = vp;
1323
1324 break;
1325 }
1326
1327 default:
1328 break;
1329 }
1330
1331 if (d_type != D_DISK)
1332 return EINVAL;
1333
1334 if (bvpp != NULL)
1335 *bvpp = bvp;
1336
1337 /*
1338 * XXX: This is bogus. We should be failing the request
1339 * XXX: not only if this specific slice is mounted, but
1340 * XXX: if it's on a disk with any other mounted slice.
1341 */
1342 if (vfs_mountedon(bvp))
1343 return EBUSY;
1344
1345 return 0;
1346 }
1347
/*
 * Make a 'unique' number from a mount type name.
 *
 * Each character of the NUL-terminated string type is folded in by
 * shifting the running value left two bits and XORing the character
 * into it.  An empty string yields 0.  The value is not actually
 * unique: different names can collide, and for long names the leading
 * characters are shifted out of the result entirely.
 *
 * The accumulation is done in unsigned arithmetic so that long names
 * do not shift set bits into or past the sign bit of a signed long,
 * which would be undefined behaviour.
 */
long
makefstype(const char *type)
{
	unsigned long rv = 0;

	for (; *type != '\0'; type++) {
		rv <<= 2;
		/* Widen via long first to keep plain char's sign extension. */
		rv ^= (unsigned long)(long)*type;
	}

	return (long)rv;
}
1362
1363 void
1364 mountlist_append(struct mount *mp)
1365 {
1366 mutex_enter(&mountlist_lock);
1367 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1368 mutex_exit(&mountlist_lock);
1369 }
1370