/*	$NetBSD: vfs_mount.c,v 1.36 2015/08/02 03:29:22 manu Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.36 2015/08/02 03:29:22 manu Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/extattr.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

/* Root filesystem. */
vnode_t *		rootvnode;

/* Mounted filesystem list. */
struct mntlist		mountlist;
kmutex_t		mountlist_lock;

kmutex_t		mntvnode_lock;
kmutex_t		vfs_list_lock;

static specificdata_domain_t mount_specificdata_domain;
static kmutex_t		mntid_lock;

static kmutex_t		mountgen_lock;
static uint64_t		mountgen;

void
vfs_mount_sysinit(void)
{

	TAILQ_INIT(&mountlist);
	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);

	mount_specificdata_domain = specificdata_domain_create();
	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
	mountgen = 0;
}

struct mount *
vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
{
	struct mount *mp;
	int error __diagused;

	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
	if (mp == NULL)
		return NULL;

	mp->mnt_op = vfsops;
	mp->mnt_refcnt = 1;
	TAILQ_INIT(&mp->mnt_vnodelist);
	mutex_init(&mp->mnt_unmounting, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
	error = vfs_busy(mp, NULL);
	KASSERT(error == 0);
	mp->mnt_vnodecovered = vp;
	mount_initspecific(mp);

	mutex_enter(&mountgen_lock);
	mp->mnt_gen = mountgen++;
	mutex_exit(&mountgen_lock);

	return mp;
}

/*
 * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
 * initialize a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;
	if (vfsp == NULL) {
		mutex_exit(&vfs_list_lock);
		return (ENODEV);
	}
	vfsp->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
		return ENOMEM;
	mp->mnt_flag = MNT_RDONLY;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	*mpp = mp;
	return 0;
}
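
/*
 * Illustrative sketch (not part of this file; "examplefs" and
 * examplefs_mountfs() are placeholders): a file system's vfs_mountroot
 * hook typically obtains its struct mount from vfs_rootmountalloc()
 * and then finishes or unwinds the mount itself, roughly:
 *
 *	struct mount *mp;
 *	int error;
 *
 *	error = vfs_rootmountalloc("examplefs", "root_device", &mp);
 *	if (error)
 *		return error;
 *	error = examplefs_mountfs(rootvp, mp, curlwp);
 *	if (error) {
 *		vfs_unbusy(mp, false, NULL);
 *		vfs_destroy(mp);
 *		return error;
 *	}
 *	mountlist_append(mp);
 *
 * The mount comes back busied with an extra reference, so the error
 * path must undo both (vfs_unbusy() drops one reference, vfs_destroy()
 * the other).
 */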

/*
 * vfs_getnewfsid: get a new unique fsid.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	mutex_enter(&mntid_lock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!TAILQ_EMPTY(&mountlist)) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	mutex_exit(&mntid_lock);
}

/*
 * Lookup a mount point by filesystem identifier.
 *
 * XXX Needs to add a reference to the mount point.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	mutex_enter(&mountlist_lock);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			mutex_exit(&mountlist_lock);
			return (mp);
		}
	}
	mutex_exit(&mountlist_lock);
	return NULL;
}

/*
 * Drop a reference to a mount structure, freeing if the last reference.
 */
void
vfs_destroy(struct mount *mp)
{

	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
		return;
	}

	/*
	 * Nothing else has visibility of the mount: we can now
	 * free the data structures.
	 */
	KASSERT(mp->mnt_refcnt == 0);
	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
	mutex_destroy(&mp->mnt_unmounting);
	mutex_destroy(&mp->mnt_updating);
	mutex_destroy(&mp->mnt_renamelock);
	if (mp->mnt_op != NULL) {
		vfs_delref(mp->mnt_op);
	}
	kmem_free(mp, sizeof(*mp));
}

/*
 * Mark a mount point as busy, and gain a new reference to it.  Used to
 * prevent the file system from being unmounted during critical sections.
 *
 * vfs_busy can be called multiple times and by multiple threads
 * and must be accompanied by the same number of vfs_unbusy calls.
 *
 * => The caller must hold a pre-existing reference to the mount.
 * => Will fail if the file system is being unmounted, or is unmounted.
 */
int
vfs_busy(struct mount *mp, struct mount **nextp)
{

	KASSERT(mp->mnt_refcnt > 0);

	mutex_enter(&mp->mnt_unmounting);
	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
		mutex_exit(&mp->mnt_unmounting);
		if (nextp != NULL) {
			KASSERT(mutex_owned(&mountlist_lock));
			*nextp = TAILQ_NEXT(mp, mnt_list);
		}
		return ENOENT;
	}
	++mp->mnt_busynest;
	KASSERT(mp->mnt_busynest != 0);
	mutex_exit(&mp->mnt_unmounting);
	if (nextp != NULL) {
		mutex_exit(&mountlist_lock);
	}
	atomic_inc_uint(&mp->mnt_refcnt);
	return 0;
}

/*
 * Unbusy a busy filesystem.
 *
 * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
 *
 * => If keepref is true, preserve reference added by vfs_busy().
 * => If nextp != NULL, acquire mountlist_lock.
 */
void
vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
{

	KASSERT(mp->mnt_refcnt > 0);

	if (nextp != NULL) {
		mutex_enter(&mountlist_lock);
	}
	mutex_enter(&mp->mnt_unmounting);
	KASSERT(mp->mnt_busynest != 0);
	mp->mnt_busynest--;
	mutex_exit(&mp->mnt_unmounting);
	if (!keepref) {
		vfs_destroy(mp);
	}
	if (nextp != NULL) {
		KASSERT(mutex_owned(&mountlist_lock));
		*nextp = TAILQ_NEXT(mp, mnt_list);
	}
}
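
/*
 * Usage sketch (an assumption about a generic caller, not lifted from a
 * specific one): code that already holds a reference and must keep the
 * file system from being unmounted across an operation brackets it as
 *
 *	if (vfs_busy(mp, NULL) == 0) {
 *		... the file system cannot be unmounted here ...
 *		vfs_unbusy(mp, false, NULL);
 *	}
 *
 * When walking mountlist with mountlist_lock held, a non-NULL nextp is
 * passed instead: vfs_busy() returns the next entry (with mountlist_lock
 * still held) when it fails, and vfs_unbusy() re-acquires mountlist_lock
 * and returns the next entry once the work is done, so the walk can
 * continue either way.
 */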

struct vnode_iterator {
	struct vnode vi_vnode;
};

void
vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vip)
{
	struct vnode *vp;

	vp = vnalloc(mp);

	mutex_enter(&mntvnode_lock);
	TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	vp->v_usecount = 1;
	mutex_exit(&mntvnode_lock);

	*vip = (struct vnode_iterator *)vp;
}

void
vfs_vnode_iterator_destroy(struct vnode_iterator *vi)
{
	struct vnode *mvp = &vi->vi_vnode;

	mutex_enter(&mntvnode_lock);
	KASSERT(ISSET(mvp->v_iflag, VI_MARKER));
	if (mvp->v_usecount != 0) {
		TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvp, v_mntvnodes);
		mvp->v_usecount = 0;
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
}

struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vi,
    bool (*f)(void *, struct vnode *), void *cl)
{
	struct vnode *mvp = &vi->vi_vnode;
	struct mount *mp = mvp->v_mount;
	struct vnode *vp;
	int error;

	KASSERT(ISSET(mvp->v_iflag, VI_MARKER));

	do {
		mutex_enter(&mntvnode_lock);
		vp = TAILQ_NEXT(mvp, v_mntvnodes);
		TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
		mvp->v_usecount = 0;
again:
		if (vp == NULL) {
			mutex_exit(&mntvnode_lock);
			return NULL;
		}
		mutex_enter(vp->v_interlock);
		if (ISSET(vp->v_iflag, VI_MARKER) ||
		    (f && !ISSET(vp->v_iflag, VI_XLOCK) && !(*f)(cl, vp))) {
			mutex_exit(vp->v_interlock);
			vp = TAILQ_NEXT(vp, v_mntvnodes);
			goto again;
		}

		TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
		mvp->v_usecount = 1;
		mutex_exit(&mntvnode_lock);
		error = vget(vp, 0, true /* wait */);
		KASSERT(error == 0 || error == ENOENT);
	} while (error != 0);

	return vp;
}
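
/*
 * Typical iteration pattern (a sketch only; vflush() below is a real
 * in-tree consumer of this interface):
 *
 *	struct vnode_iterator *marker;
 *	vnode_t *vp;
 *
 *	vfs_vnode_iterator_init(mp, &marker);
 *	while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL)) != NULL) {
 *		... vp is referenced (via vget()) but not locked ...
 *		vrele(vp);
 *	}
 *	vfs_vnode_iterator_destroy(marker);
 *
 * A non-NULL selector lets the caller skip vnodes while mntvnode_lock
 * is still held, as vflush_selector() does below.
 */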

/*
 * Move a vnode from one mount queue to another.
 */
void
vfs_insmntque(vnode_t *vp, struct mount *mp)
{
	struct mount *omp;

	KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
	    vp->v_tag == VT_VFS);

	mutex_enter(&mntvnode_lock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ((omp = vp->v_mount) != NULL)
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if
	 * available.  The caller must take a reference on the mount
	 * structure and donate it to the vnode.
	 */
	if ((vp->v_mount = mp) != NULL)
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
	mutex_exit(&mntvnode_lock);

	if (omp != NULL) {
		/* Release reference to old mount. */
		vfs_destroy(omp);
	}
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active vnodes;
 * an error is returned if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

struct vflush_ctx {
	const struct vnode *skipvp;
	int flags;
};

static bool
vflush_selector(void *cl, struct vnode *vp)
{
	struct vflush_ctx *c = cl;
	/*
	 * Skip over a selected vnode.
	 */
	if (vp == c->skipvp)
		return false;
	/*
	 * Skip over vnodes marked VV_SYSTEM.
	 */
	if ((c->flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))
		return false;

	/*
	 * If WRITECLOSE is set, only flush out regular file
	 * vnodes open for writing.
	 */
	if ((c->flags & WRITECLOSE) && vp->v_type == VREG) {
		if (vp->v_writecount == 0)
			return false;
	}
	return true;
}

static vnode_t *
vflushnext(struct vnode_iterator *marker, void *ctx, int *when)
{
	if (hardclock_ticks > *when) {
		yield();
		*when = hardclock_ticks + hz / 10;
	}
	return vfs_vnode_iterator_next(marker, vflush_selector, ctx);
}


int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
	vnode_t *vp;
	struct vnode_iterator *marker;
	int busy = 0, when = 0;
	struct vflush_ctx ctx;

	/* First, flush out any vnode references from vrele_list. */
	vrele_flush();

	vfs_vnode_iterator_init(mp, &marker);

	ctx.skipvp = skipvp;
	ctx.flags = flags;
	while ((vp = vflushnext(marker, &ctx, &when)) != NULL) {
		/*
		 * First try to recycle the vnode.
		 */
		if (vrecycle(vp))
			continue;
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 */
		if (flags & FORCECLOSE) {
			vgone(vp);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		vrele(vp);
		busy++;
	}
	vfs_vnode_iterator_destroy(marker);
	if (busy)
		return (EBUSY);
	return (0);
}
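
/*
 * Note (general pattern, not a requirement imposed here): file systems
 * normally reach vflush() from their VFS_UNMOUNT path, along the lines
 * of
 *
 *	error = vflush(mp, devvp, (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
 *
 * where devvp stands for a vnode the caller still needs and therefore
 * asks vflush() to skip (or NULL to flush everything).
 */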

/*
 * Mount a file system.
 */

/*
 * Scan all active processes to see if any of them have a current or root
 * directory onto which the new filesystem has just been mounted.  If so,
 * replace them with the new mount point.
 */
static void
mount_checkdirs(vnode_t *olddp)
{
	vnode_t *newdp, *rele1, *rele2;
	struct cwdinfo *cwdi;
	struct proc *p;
	bool retry;

	if (olddp->v_usecount == 1) {
		return;
	}
	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
		panic("mount: lost mount");

	do {
		retry = false;
		mutex_enter(proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if ((cwdi = p->p_cwdi) == NULL)
				continue;
			/*
			 * Cannot change to the old directory any more,
			 * so even if we see a stale value it is not a
			 * problem.
			 */
			if (cwdi->cwdi_cdir != olddp &&
			    cwdi->cwdi_rdir != olddp)
				continue;
			retry = true;
			rele1 = NULL;
			rele2 = NULL;
			atomic_inc_uint(&cwdi->cwdi_refcnt);
			mutex_exit(proc_lock);
			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
			if (cwdi->cwdi_cdir == olddp) {
				rele1 = cwdi->cwdi_cdir;
				vref(newdp);
				cwdi->cwdi_cdir = newdp;
			}
			if (cwdi->cwdi_rdir == olddp) {
				rele2 = cwdi->cwdi_rdir;
				vref(newdp);
				cwdi->cwdi_rdir = newdp;
			}
			rw_exit(&cwdi->cwdi_lock);
			cwdfree(cwdi);
			if (rele1 != NULL)
				vrele(rele1);
			if (rele2 != NULL)
				vrele(rele2);
			mutex_enter(proc_lock);
			break;
		}
		mutex_exit(proc_lock);
	} while (retry);

	if (rootvnode == olddp) {
		vrele(rootvnode);
		vref(newdp);
		rootvnode = newdp;
	}
	vput(newdp);
}

/*
 * Start extended attributes
 */
static int
start_extattr(struct mount *mp)
{
	int error;

	error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
	if (error)
		printf("%s: failed to start extattr: error = %d\n",
		    mp->mnt_stat.f_mntonname, error);

	return error;
}

int
mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
    const char *path, int flags, void *data, size_t *data_len)
{
	vnode_t *vp = *vpp;
	struct mount *mp;
	struct pathbuf *pb;
	struct nameidata nd;
	int error;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
	if (error) {
		vfs_delref(vfsops);
		return error;
	}

	/* Cannot make a non-dir a mount-point (from here anyway). */
	if (vp->v_type != VDIR) {
		vfs_delref(vfsops);
		return ENOTDIR;
	}

	if (flags & MNT_EXPORTED) {
		vfs_delref(vfsops);
		return EINVAL;
	}

	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
		vfs_delref(vfsops);
		return ENOMEM;
	}

	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);

	/*
	 * The underlying file system may refuse the mount for
	 * various reasons.  Allow the user to force it to happen.
	 *
	 * Set the mount level flags.
	 */
	mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);

	mutex_enter(&mp->mnt_updating);
	error = VFS_MOUNT(mp, path, data, data_len);
	mp->mnt_flag &= ~MNT_OP_FLAGS;

	if (error != 0)
		goto err_unmounted;

	/*
	 * Validate and prepare the mount point.
	 */
	error = pathbuf_copyin(path, &pb);
	if (error != 0) {
		goto err_mounted;
	}
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
	error = namei(&nd);
	pathbuf_destroy(pb);
	if (error != 0) {
		goto err_mounted;
	}
	if (nd.ni_vp != vp) {
		vput(nd.ni_vp);
		error = EINVAL;
		goto err_mounted;
	}
	if (vp->v_mountedhere != NULL) {
		vput(nd.ni_vp);
		error = EBUSY;
		goto err_mounted;
	}
	error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
	if (error != 0) {
		vput(nd.ni_vp);
		goto err_mounted;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	cache_purge(vp);
	mp->mnt_iflag &= ~IMNT_WANTRDWR;

	mutex_enter(&mountlist_lock);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mutex_exit(&mountlist_lock);
	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
		vfs_syncer_add_to_worklist(mp);
	vp->v_mountedhere = mp;
	vput(nd.ni_vp);

	mount_checkdirs(vp);
	mutex_exit(&mp->mnt_updating);

	/* Hold an additional reference to the mount across VFS_START(). */
	vfs_unbusy(mp, true, NULL);
	(void) VFS_STATVFS(mp, &mp->mnt_stat);
	error = VFS_START(mp, 0);
	if (error) {
		vrele(vp);
	} else if (flags & MNT_EXTATTR) {
		(void)start_extattr(mp);
	}
	/* Drop reference held for VFS_START(). */
	vfs_destroy(mp);
	*vpp = NULL;
	return error;

err_mounted:
	if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
		panic("Unmounting fresh file system failed");

err_unmounted:
	vp->v_mountedhere = NULL;
	mutex_exit(&mp->mnt_updating);
	vfs_unbusy(mp, false, NULL);
	vfs_destroy(mp);

	return error;
}

/*
 * Do the actual file system unmount.  File system is assumed to have
 * been locked by the caller.
 *
 * => The caller must hold a reference to the mount, taken explicitly
 *    for dounmount().
 */
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
	vnode_t *coveredvp;
	int error, async, used_syncer, used_extattr;

#if NVERIEXEC > 0
	error = veriexec_unmountchk(mp);
	if (error)
		return (error);
#endif /* NVERIEXEC > 0 */

	/*
	 * XXX Freeze syncer.  Must do this before locking the
	 * mount point.  See dounmount() for details.
	 */
	mutex_enter(&syncer_mutex);

	/*
	 * Abort unmount attempt when the filesystem is in use
	 */
	mutex_enter(&mp->mnt_unmounting);
	if (mp->mnt_busynest != 0) {
		mutex_exit(&mp->mnt_unmounting);
		mutex_exit(&syncer_mutex);
		return EBUSY;
	}

	/*
	 * Abort unmount attempt when the filesystem is not mounted
	 */
	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
		mutex_exit(&mp->mnt_unmounting);
		mutex_exit(&syncer_mutex);
		return ENOENT;
	}

	used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
	used_extattr = mp->mnt_flag & MNT_EXTATTR;

	/*
	 * XXX Syncer must be frozen when we get here.  This should really
	 * be done on a per-mountpoint basis, but the syncer doesn't work
	 * like that.
	 *
	 * The caller of dounmount() must acquire syncer_mutex because
	 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
	 * order, and we must preserve that order to avoid deadlock.
	 *
	 * So, if the file system did not use the syncer, now is
	 * the time to release the syncer_mutex.
	 */
	if (used_syncer == 0) {
		mutex_exit(&syncer_mutex);
	}
	mp->mnt_iflag |= IMNT_UNMOUNT;
	mutex_enter(&mp->mnt_updating);
	async = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (used_syncer)
		vfs_syncer_remove_from_worklist(mp);
	error = 0;
	if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
	}
	if (error == 0 || (flags & MNT_FORCE)) {
		error = VFS_UNMOUNT(mp, flags);
	}
	if (error) {
		mp->mnt_iflag &= ~IMNT_UNMOUNT;
		mutex_exit(&mp->mnt_unmounting);
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
			vfs_syncer_add_to_worklist(mp);
		mp->mnt_flag |= async;
		mutex_exit(&mp->mnt_updating);
		if (used_syncer)
			mutex_exit(&syncer_mutex);
		if (used_extattr) {
			if (start_extattr(mp) != 0)
				mp->mnt_flag &= ~MNT_EXTATTR;
			else
				mp->mnt_flag |= MNT_EXTATTR;
		}
		return (error);
	}
	mutex_exit(&mp->mnt_updating);

	/*
	 * Release the mnt_unmounting lock here, because other code calls
	 * vfs_busy() while holding the mountlist_lock.
	 *
	 * Mark the filesystem as gone to prevent further unmount attempts
	 * once the mnt_unmounting lock is released; this also prevents
	 * vfs_busy() from succeeding.
	 */
	mp->mnt_iflag |= IMNT_GONE;
	mutex_exit(&mp->mnt_unmounting);

	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY);
		coveredvp->v_mountedhere = NULL;
		VOP_UNLOCK(coveredvp);
	}
	mutex_enter(&mountlist_lock);
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	mutex_exit(&mountlist_lock);
	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
		panic("unmount: dangling vnode");
	if (used_syncer)
		mutex_exit(&syncer_mutex);
	vfs_hooks_unmount(mp);

	vfs_destroy(mp);	/* reference from mount() */
	if (coveredvp != NULLVP) {
		vrele(coveredvp);
	}
	return (0);
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
bool
vfs_unmountall(struct lwp *l)
{

	printf("unmounting file systems...\n");
	return vfs_unmountall1(l, true, true);
}

static void
vfs_unmount_print(struct mount *mp, const char *pfx)
{

	aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
	    mp->mnt_stat.f_fstypename);
}

bool
vfs_unmount_forceone(struct lwp *l)
{
	struct mount *mp, *nmp;
	int error;

	nmp = NULL;

	TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
		if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen) {
			nmp = mp;
		}
	}
	if (nmp == NULL) {
		return false;
	}

#ifdef DEBUG
	printf("forcefully unmounting %s (%s)...\n",
	    nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
#endif
	atomic_inc_uint(&nmp->mnt_refcnt);
	if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
		vfs_unmount_print(nmp, "forcefully ");
		return true;
	} else {
		vfs_destroy(nmp);
	}

#ifdef DEBUG
	printf("forceful unmount of %s failed with error %d\n",
	    nmp->mnt_stat.f_mntonname, error);
#endif

	return false;
}

bool
vfs_unmountall1(struct lwp *l, bool force, bool verbose)
{
	struct mount *mp, *nmp;
	bool any_error = false, progress = false;
	int error;

	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
#ifdef DEBUG
		printf("unmounting %p %s (%s)...\n",
		    (void *)mp, mp->mnt_stat.f_mntonname,
		    mp->mnt_stat.f_mntfromname);
#endif
		atomic_inc_uint(&mp->mnt_refcnt);
		if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
			vfs_unmount_print(mp, "");
			progress = true;
		} else {
			vfs_destroy(mp);
			if (verbose) {
				printf("unmount of %s failed with error %d\n",
				    mp->mnt_stat.f_mntonname, error);
			}
			any_error = true;
		}
	}
	if (verbose) {
		printf("unmounting done\n");
	}
	if (any_error && verbose) {
		printf("WARNING: some file systems would not unmount\n");
	}
	return progress;
}

void
vfs_sync_all(struct lwp *l)
{
	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void)spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	do_sys_sync(l);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	lwp_t *l = curlwp;

	vfs_sync_all(l);

	/*
	 * If we have paniced - do not make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL) {
		return;
	}

	/* Unmount file systems. */
	vfs_unmountall(l);
}

/*
 * Print a list of supported file system types (used by vfs_mountroot)
 */
static void
vfs_print_fstypes(void)
{
	struct vfsops *v;
	int cnt = 0;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list)
		++cnt;
	mutex_exit(&vfs_list_lock);

	if (cnt == 0) {
		printf("WARNING: No file system modules have been loaded.\n");
		return;
	}

	printf("Supported file systems:");
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		printf(" %s", v->vfs_name);
	}
	mutex_exit(&vfs_list_lock);
	printf("\n");
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (device_class(root_device)) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%llx -> %llu,%llu)",
			    (unsigned long long)rootdev,
			    (unsigned long long)major(rootdev),
			    (unsigned long long)minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	case DV_VIRTUAL:
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    device_xname(root_device));
		return (ENODEV);
	}

	/*
	 * If the user specified a root fs type, use it.  Make sure the
	 * specified type exists and has a vfs_mountroot()
	 */
	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
		v = vfs_getopsbyname(rootfstype);
		error = EFTYPE;
		if (v != NULL) {
			if (v->vfs_mountroot != NULL) {
				error = (v->vfs_mountroot)();
			}
			v->vfs_refcount--;
		}
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		v->vfs_refcount++;
		mutex_exit(&vfs_list_lock);
		error = (*v->vfs_mountroot)();
		mutex_enter(&vfs_list_lock);
		v->vfs_refcount--;
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}
	mutex_exit(&vfs_list_lock);

	if (v == NULL) {
		vfs_print_fstypes();
		printf("no file system for %s", device_xname(root_device));
		if (device_class(root_device) == DV_DISK)
			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	if (error && device_class(root_device) == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED);
		vrele(rootvp);
	}
	if (error == 0) {
		struct mount *mp;
		extern struct cwdinfo cwdi0;

		mp = TAILQ_FIRST(&mountlist);
		mp->mnt_flag |= MNT_ROOTFS;
		mp->mnt_op->vfs_refcount++;

		/*
		 * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
		 * reference it.
		 */
		error = VFS_ROOT(mp, &rootvnode);
		if (error)
			panic("cannot find root vnode, error=%d", error);
		cwdi0.cwdi_cdir = rootvnode;
		vref(cwdi0.cwdi_cdir);
		VOP_UNLOCK(rootvnode);
		cwdi0.cwdi_rdir = NULL;

		/*
		 * Now that root is mounted, we can fixup initproc's CWD
		 * info.  All other processes are kthreads, which merely
		 * share proc0's CWD info.
		 */
		initproc->p_cwdi->cwdi_cdir = rootvnode;
		vref(initproc->p_cwdi->cwdi_cdir);
		initproc->p_cwdi->cwdi_rdir = NULL;
		/*
		 * Enable loading of modules from the filesystem
		 */
		module_load_vfs_init();

	}
	return (error);
}

/*
 * mount_specific_key_create --
 *	Create a key for subsystem mount-specific data.
 */
int
mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{

	return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
}

/*
 * mount_specific_key_delete --
 *	Delete a key for subsystem mount-specific data.
 */
void
mount_specific_key_delete(specificdata_key_t key)
{

	specificdata_key_delete(mount_specificdata_domain, key);
}

/*
 * mount_initspecific --
 *	Initialize a mount's specificdata container.
 */
void
mount_initspecific(struct mount *mp)
{
	int error __diagused;

	error = specificdata_init(mount_specificdata_domain,
	    &mp->mnt_specdataref);
	KASSERT(error == 0);
}

/*
 * mount_finispecific --
 *	Finalize a mount's specificdata container.
 */
void
mount_finispecific(struct mount *mp)
{

	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
}

/*
 * mount_getspecific --
 *	Return mount-specific data corresponding to the specified key.
 */
void *
mount_getspecific(struct mount *mp, specificdata_key_t key)
{

	return specificdata_getspecific(mount_specificdata_domain,
	    &mp->mnt_specdataref, key);
}

/*
 * mount_setspecific --
 *	Set mount-specific data corresponding to the specified key.
 */
void
mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
{

	specificdata_setspecific(mount_specificdata_domain,
	    &mp->mnt_specdataref, key, data);
}
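
/*
 * Usage sketch (the key, data and destructor names are made up for
 * illustration): a subsystem wanting per-mount private data creates a
 * key once, then attaches and retrieves its data on individual mounts:
 *
 *	static specificdata_key_t example_key;
 *
 *	error = mount_specific_key_create(&example_key, example_dtor);
 *	...
 *	mount_setspecific(mp, example_key, data);
 *	...
 *	data = mount_getspecific(mp, example_key);
 *
 * The per-mount container itself is set up in mount_initspecific()
 * (from vfs_mountalloc()) and torn down again in mount_finispecific()
 * / vfs_destroy().
 */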

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vnode_t *vp)
{
	vnode_t *vq;
	int error = 0;

	if (vp->v_type != VBLK)
		return ENOTBLK;
	if (spec_node_getmountedfs(vp) != NULL)
		return EBUSY;
	if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) {
		if (spec_node_getmountedfs(vq) != NULL)
			error = EBUSY;
		vrele(vq);
	}

	return error;
}

/*
 * Check if a device pointed to by vp is mounted.
 *
 * Returns:
 *	EINVAL	if it's not a disk
 *	EBUSY	if it's a disk and mounted
 *	0	if it's a disk and not mounted
 */
int
rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
{
	vnode_t *bvp;
	dev_t dev;
	int d_type;

	bvp = NULL;
	d_type = D_OTHER;

	if (iskmemvp(vp))
		return EINVAL;

	switch (vp->v_type) {
	case VCHR: {
		const struct cdevsw *cdev;

		dev = vp->v_rdev;
		cdev = cdevsw_lookup(dev);
		if (cdev != NULL) {
			dev_t blkdev;

			blkdev = devsw_chr2blk(dev);
			if (blkdev != NODEV) {
				if (vfinddev(blkdev, VBLK, &bvp) != 0) {
					d_type = (cdev->d_flag & D_TYPEMASK);
					/* XXX: what if bvp disappears? */
					vrele(bvp);
				}
			}
		}

		break;
	}

	case VBLK: {
		const struct bdevsw *bdev;

		dev = vp->v_rdev;
		bdev = bdevsw_lookup(dev);
		if (bdev != NULL)
			d_type = (bdev->d_flag & D_TYPEMASK);

		bvp = vp;

		break;
	}

	default:
		break;
	}

	if (d_type != D_DISK)
		return EINVAL;

	if (bvpp != NULL)
		*bvpp = bvp;

	/*
	 * XXX: This is bogus.  We should be failing the request
	 * XXX: not only if this specific slice is mounted, but
	 * XXX: if it's on a disk with any other mounted slice.
	 */
	if (vfs_mountedon(bvp))
		return EBUSY;

	return 0;
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(const char *type)
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}
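
/*
 * Worked example of the shift-and-XOR above: makefstype("ffs") folds in
 * 'f' (0x66), 'f' and 's' (0x73), taking rv through 0x66 -> 0x1fe -> 0x78b.
 */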

void
mountlist_append(struct mount *mp)
{
	mutex_enter(&mountlist_lock);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mutex_exit(&mountlist_lock);
}
