Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel (at) dawidek.net>.
     24  * All rights reserved.
     25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
     26  * Copyright (c) 2014 Integros [integros.com]
     27  */
     28 
     29 /* Portions Copyright 2010 Robert Milkowski */
     30 
     31 #include <sys/types.h>
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/kernel.h>
     35 #include <sys/sysmacros.h>
     36 #include <sys/kmem.h>
     37 #include <sys/acl.h>
     38 #include <sys/vnode.h>
     39 #include <sys/vfs.h>
     40 #include <sys/mntent.h>
     41 #include <sys/mount.h>
     42 #include <sys/cmn_err.h>
     43 #include <sys/zfs_znode.h>
     44 #include <sys/zfs_dir.h>
     45 #include <sys/zil.h>
     46 #include <sys/fs/zfs.h>
     47 #include <sys/dmu.h>
     48 #include <sys/dsl_prop.h>
     49 #include <sys/dsl_dataset.h>
     50 #include <sys/dsl_deleg.h>
     51 #include <sys/spa.h>
     52 #include <sys/zap.h>
     53 #include <sys/sa.h>
     54 #include <sys/sa_impl.h>
     55 #include <sys/varargs.h>
     56 #include <sys/policy.h>
     57 #include <sys/atomic.h>
     58 #include <sys/zfs_ioctl.h>
     59 #include <sys/zfs_ctldir.h>
     60 #include <sys/zfs_fuid.h>
     61 #include <sys/sunddi.h>
     62 #include <sys/dnlc.h>
     63 #include <sys/dmu_objset.h>
     64 #include <sys/spa_boot.h>
     65 #include "zfs_comutil.h"
     66 
     67 #ifdef __FreeBSD_kernel__
     68 
     69 #include <sys/jail.h>
     70 
/*
 * FreeBSD-only globals and sysctl knobs.
 *
 * zfs_debug_mtx is a default (MTX_DEF) mutex named "zfs_debug" that is
 * initialized through MTX_SYSINIT.  What it protects is not visible in this
 * part of the file.
 */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Parent sysctl node: vfs.zfs */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* vfs.zfs.super_owner (read-write); initial value 0. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owner can perform privileged operation on his file systems");

/* vfs.zfs.debug (CTLFLAG_RWTUN); initial value 0. */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
    "Debug level");

/*
 * Read-only vfs.zfs.version.{acl,spa,zpl} nodes exporting the compile-time
 * ZFS_ACL_VERSION, SPA_VERSION and ZPL_VERSION constants.
 */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");
     94 
/*
 * Forward declarations (FreeBSD build).  The first eight are the entry
 * points installed in the FreeBSD zfs_vfsops table; zfs_objset_close() and
 * zfs_freevfs() are file-local helpers.
 */
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);
    106 
/*
 * FreeBSD VFS operations vector for ZFS.  Members not listed here are left
 * zero-initialized by the designated initializer.
 */
struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
	.vfs_root =		zfs_root,
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
};

/* Register the file system type "zfs" with the jail and delegated-admin flags. */
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
    119 
    120 #endif /* __FreeBSD_kernel__ */
    121 
    122 #ifdef __NetBSD__
    123 
    124 #include <sys/fstrans.h>
    125 #include <sys/mkdev.h>
    126 #include <miscfs/genfs/genfs.h>
    127 
/* NetBSD counterparts of the debug level and debug mutex globals. */
int zfs_debug_level;
kmutex_t zfs_debug_mtx;

/*
 * Compatibility shims for code shared with FreeBSD: the Giant-lock macros
 * expand to nothing and vfs_stdsync() expands to the constant 0, so the
 * vfs_stdsync() call in zfs_sync() always reports success on NetBSD.
 */
#define	DROP_GIANT()	/* nothing */
#define PICKUP_GIANT()	/* nothing */
#define vfs_stdsync(a, b) 0
    134 
/*
 * Forward declarations (NetBSD build) for the entry points installed in the
 * NetBSD zfs_vfsops table, plus zfs_sync() (called by zfs_netbsd_sync()) and
 * zfs_freevfs().
 */
static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
static int zfs_netbsd_vptofh(vnode_t *vp, fid_t *fidp, size_t *fh_size);
static int zfs_netbsd_fhtovp(vfs_t *vfsp, fid_t *fidp, int lktype, vnode_t **vpp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int lktype, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr);
static void zfs_freevfs(vfs_t *vfsp);

/* Installed as .vfs_init and .vfs_done in zfs_vfsops. */
void zfs_init(void);
void zfs_fini(void);
    148 
/* Vnode operation vector descriptors, defined outside this file section. */
extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;
extern const struct vnodeopv_desc zfs_specop_opv_desc;
extern const struct vnodeopv_desc zfs_fifoop_opv_desc;
extern const struct vnodeopv_desc zfs_sfsop_opv_desc;

/*
 * NULL-terminated list of the descriptors above; installed as
 * .vfs_opv_descs in zfs_vfsops.
 */
static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
	&zfs_vnodeop_opv_desc,
	&zfs_specop_opv_desc,
	&zfs_fifoop_opv_desc,
	&zfs_sfsop_opv_desc,
	NULL,
};
    161 
/*
 * NetBSD VFS operations vector for ZFS.
 *
 * Sync and file-handle conversion go through the zfs_netbsd_* wrappers
 * defined below.  Rename locking and suspend control use the genfs
 * implementations.  The remaining hooks are filled with nullop or eopnotsupp
 * cast to (void *) -- presumably the generic "do nothing" and "not supported"
 * stubs; confirm against the kernel headers.
 *
 * NOTE(review): zfs_loadvnode and zfs_newvnode are not declared in the
 * visible part of this file; they are assumed to come from an included
 * header.
 */
struct vfsops zfs_vfsops = {
	.vfs_name = MOUNT_ZFS,
	.vfs_min_mount_data = sizeof(struct zfs_args),
	.vfs_opv_descs = zfs_vnodeop_descs,
	.vfs_mount = zfs_mount,
	.vfs_unmount = zfs_umount,
	.vfs_root = zfs_root,
	.vfs_statvfs = zfs_statvfs,
	.vfs_sync = zfs_netbsd_sync,
	.vfs_vget = zfs_vget,
	.vfs_loadvnode = zfs_loadvnode,
	.vfs_newvnode = zfs_newvnode,
	.vfs_init = zfs_init,
	.vfs_done = zfs_fini,
	.vfs_start = (void *)nullop,
	.vfs_renamelock_enter = genfs_renamelock_enter,
	.vfs_renamelock_exit = genfs_renamelock_exit,
	.vfs_reinit = (void *)nullop,
	.vfs_vptofh = zfs_netbsd_vptofh,
	.vfs_fhtovp = zfs_netbsd_fhtovp,
	.vfs_quotactl = (void *)eopnotsupp,
	.vfs_extattrctl = (void *)eopnotsupp,
	.vfs_suspendctl = genfs_suspendctl,
	.vfs_snapshot = (void *)eopnotsupp,
	.vfs_fsync = (void *)eopnotsupp,
};
    188 
    189 static int
    190 zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr)
    191 {
    192 	/*
    193 	 * Do the regular ZFS stuff.
    194 	 */
    195 	return zfs_sync(vfsp, waitfor);
    196 }
    197 
    198 static int
    199 zfs_netbsd_vptofh(vnode_t *vp, fid_t *fidp, size_t *fh_size)
    200 {
    201 	znode_t		*zp;
    202 	zfsvfs_t	*zfsvfs;
    203 	uint32_t	gen;
    204 	uint64_t	gen64;
    205 	uint64_t	object;
    206 	zfid_short_t	*zfid;
    207 	int		size, i, error;
    208 
    209 	if (zfsctl_is_node(vp))
    210 		return zfsctl_vptofh(vp, fidp, fh_size);
    211 
    212 	zp = VTOZ(vp);
    213 	zfsvfs = zp->z_zfsvfs;
    214 	object = zp->z_id;
    215 
    216 	ZFS_ENTER(zfsvfs);
    217 	ZFS_VERIFY_ZP(zp);
    218 
    219 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
    220 	    &gen64, sizeof (uint64_t))) != 0) {
    221 		ZFS_EXIT(zfsvfs);
    222 		return (error);
    223 	}
    224 
    225 	gen = (uint32_t)gen64;
    226 
    227 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
    228 
    229 	if (*fh_size < size) {
    230 		ZFS_EXIT(zfsvfs);
    231 		*fh_size = size;
    232 		return SET_ERROR(E2BIG);
    233 	}
    234 	*fh_size = size;
    235 
    236 	zfid = (zfid_short_t *)fidp;
    237 
    238 	zfid->zf_len = size;
    239 
    240 	for (i = 0; i < sizeof (zfid->zf_object); i++)
    241 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
    242 
    243 	/* Must have a non-zero generation number to distinguish from .zfs */
    244 	if (gen == 0)
    245 		gen = 1;
    246 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
    247 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
    248 
    249 	if (size == LONG_FID_LEN) {
    250 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
    251 		zfid_long_t	*zlfid;
    252 
    253 		zlfid = (zfid_long_t *)fidp;
    254 
    255 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
    256 			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
    257 
    258 		/* XXX - this should be the generation number for the objset */
    259 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
    260 			zlfid->zf_setgen[i] = 0;
    261 	}
    262 
    263 	ZFS_EXIT(zfsvfs);
    264 	return 0;
    265 }
    266 
    267 static int
    268 zfs_netbsd_fhtovp(vfs_t *vfsp, fid_t *fidp, int lktype, vnode_t **vpp)
    269 {
    270 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
    271 	znode_t		*zp;
    272 	vnode_t		*dvp;
    273 	uint64_t	object = 0;
    274 	uint64_t	fid_gen = 0;
    275 	uint64_t	gen_mask;
    276 	uint64_t	zp_gen;
    277 	int 		i, err;
    278 
    279 	*vpp = NULL;
    280 
    281 	ZFS_ENTER(zfsvfs);
    282 
    283 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
    284 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
    285 		uint64_t	objsetid = 0;
    286 		uint64_t	setgen = 0;
    287 
    288 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
    289 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
    290 
    291 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
    292 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
    293 
    294 		ZFS_EXIT(zfsvfs);
    295 
    296 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
    297 		if (err)
    298 			return (SET_ERROR(EINVAL));
    299 		ZFS_ENTER(zfsvfs);
    300 	}
    301 
    302 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
    303 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
    304 
    305 		for (i = 0; i < sizeof (zfid->zf_object); i++)
    306 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
    307 
    308 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
    309 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
    310 	} else {
    311 		ZFS_EXIT(zfsvfs);
    312 		return (SET_ERROR(EINVAL));
    313 	}
    314 
    315 	/* A zero fid_gen means we are in the .zfs control directories */
    316 	if (fid_gen == 0 &&
    317 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
    318 		ZFS_EXIT(zfsvfs);
    319 		if (object == ZFSCTL_INO_ROOT)
    320 			err = zfsctl_root(zfsvfs, vpp);
    321 		else
    322 			err = zfsctl_snapshot(zfsvfs, vpp);
    323 		if (err)
    324 			return err;
    325 		err = vn_lock(*vpp, LK_EXCLUSIVE);
    326 		if (err) {
    327 			vrele(*vpp);
    328 			*vpp = NULL;
    329 			return err;
    330 		}
    331 		return 0;
    332 	}
    333 
    334 	gen_mask = -1ULL >> (64 - 8 * i);
    335 
    336 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
    337 	if (err = zfs_zget(zfsvfs, object, &zp)) {
    338 		ZFS_EXIT(zfsvfs);
    339 		return SET_ERROR(ESTALE);
    340 	}
    341 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
    342 	    sizeof (uint64_t));
    343 	zp_gen = zp_gen & gen_mask;
    344 	if (zp_gen == 0)
    345 		zp_gen = 1;
    346 	if (zp->z_unlinked || zp_gen != fid_gen) {
    347 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
    348 		VN_RELE(ZTOV(zp));
    349 		ZFS_EXIT(zfsvfs);
    350 		return SET_ERROR(ESTALE);
    351 	}
    352 
    353 	*vpp = ZTOV(zp);
    354 	ZFS_EXIT(zfsvfs);
    355 	err = vn_lock(*vpp, lktype);
    356 	if (err) {
    357 		vrele(*vpp);
    358 		*vpp = NULL;
    359 		return err;
    360 	}
    361 	return 0;
    362 }
    363 #endif /* __NetBSD__ */
    364 
/*
 * Count of active ZFS file systems.  It is kept so that the module is not
 * unloaded after a forced unmount (umount -f).
 */
static uint32_t	zfs_active_fs_count = 0;
    371 
    372 /*ARGSUSED*/
    373 static int
    374 zfs_sync(vfs_t *vfsp, int waitfor)
    375 {
    376         /*
    377 	 * Data integrity is job one.  We don't want a compromised kernel
    378 	 * writing to the storage pool, so we never sync during panic.
    379 	 */
    380 	if (panicstr)
    381 		return (0);
    382 
    383 	/*
    384 	 * Ignore the system syncher.  ZFS already commits async data
    385 	 * at zfs_txg_timeout intervals.
    386 	 */
    387 	if (waitfor == MNT_LAZY)
    388 		return (0);
    389 
    390 	if (vfsp != NULL) {
    391 		/*
    392 		 * Sync a specific filesystem.
    393 		 */
    394 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
    395 		dsl_pool_t *dp;
    396 		int error;
    397 
    398 		error = vfs_stdsync(vfsp, waitfor);
    399 		if (error != 0)
    400 			return (error);
    401 
    402 		ZFS_ENTER(zfsvfs);
    403 		dp = dmu_objset_pool(zfsvfs->z_os);
    404 
    405 		/*
    406 		 * If the system is shutting down, then skip any
    407 		 * filesystems which may exist on a suspended pool.
    408 		 */
    409 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
    410 			ZFS_EXIT(zfsvfs);
    411 			return (0);
    412 		}
    413 
    414 		if (zfsvfs->z_log != NULL)
    415 			zil_commit(zfsvfs->z_log, 0);
    416 
    417 		ZFS_EXIT(zfsvfs);
    418 	} else {
    419 		/*
    420 		 * Sync all ZFS filesystems.  This is what happens when you
    421 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
    422 		 * request by waiting for all pools to commit all dirty data.
    423 		 */
    424 		spa_sync_allpools();
    425 	}
    426 
    427 	return (0);
    428 }
    429 
    430 #ifdef illumos
/*
 * Pick a device number that is not currently mounted (illumos only).
 *
 * The inner loop advances the global zfs_minor under zfs_dev_mtx, wrapping
 * once MAXMIN32 is reached, and builds a candidate with makedevice() until
 * vfs_devismounted() reports it free or the minor has cycled back to where
 * it started.  If the whole minor space is in use, a new major is obtained
 * from getudev() and the search restarts.
 *
 * dev  receives the chosen device number.
 *
 * Returns 0 on success, -1 if no new major number could be obtained.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN32);
		/* Remember the starting point so a full wrap can be detected. */
		minor_t start = zfs_minor;
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN32) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
		} while (vfs_devismounted(*dev) && zfs_minor != start);
#ifdef illumos
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			/* Found an unused device number. */
			break;
		}
		/* CONSTANTCONDITION */
#endif
	} while (1);

	return (0);
}
    484 #endif	/* illumos */
    485 
    486 
    487 static void
    488 atime_changed_cb(void *arg, uint64_t newval)
    489 {
    490 	zfsvfs_t *zfsvfs = arg;
    491 
    492 	if (newval == TRUE) {
    493 		zfsvfs->z_atime = TRUE;
    494 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
    495 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
    496 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
    497 	} else {
    498 		zfsvfs->z_atime = FALSE;
    499 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
    500 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
    501 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
    502 	}
    503 }
    504 
    505 static void
    506 xattr_changed_cb(void *arg, uint64_t newval)
    507 {
    508 	zfsvfs_t *zfsvfs = arg;
    509 
    510 	if (newval == TRUE) {
    511 		/* XXX locking on vfs_flag? */
    512 #ifdef TODO
    513 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
    514 #endif
    515 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
    516 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
    517 	} else {
    518 		/* XXX locking on vfs_flag? */
    519 #ifdef TODO
    520 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
    521 #endif
    522 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
    523 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
    524 	}
    525 }
    526 
    527 static void
    528 blksz_changed_cb(void *arg, uint64_t newval)
    529 {
    530 	zfsvfs_t *zfsvfs = arg;
    531 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
    532 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
    533 	ASSERT(ISP2(newval));
    534 
    535 	zfsvfs->z_max_blksz = newval;
    536 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
    537 }
    538 
    539 static void
    540 readonly_changed_cb(void *arg, uint64_t newval)
    541 {
    542 	zfsvfs_t *zfsvfs = arg;
    543 
    544 	if (newval) {
    545 		/* XXX locking on vfs_flag? */
    546 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
    547 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
    548 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
    549 	} else {
    550 		/* XXX locking on vfs_flag? */
    551 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    552 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
    553 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
    554 	}
    555 }
    556 
    557 static void
    558 setuid_changed_cb(void *arg, uint64_t newval)
    559 {
    560 	zfsvfs_t *zfsvfs = arg;
    561 
    562 	if (newval == FALSE) {
    563 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
    564 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
    565 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
    566 	} else {
    567 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
    568 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
    569 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
    570 	}
    571 }
    572 
    573 static void
    574 exec_changed_cb(void *arg, uint64_t newval)
    575 {
    576 	zfsvfs_t *zfsvfs = arg;
    577 
    578 	if (newval == FALSE) {
    579 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
    580 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
    581 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
    582 	} else {
    583 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
    584 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
    585 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
    586 	}
    587 }
    588 
    589 /*
    590  * The nbmand mount option can be changed at mount time.
    591  * We can't allow it to be toggled on live file systems or incorrect
    592  * behavior may be seen from cifs clients
    593  *
    594  * This property isn't registered via dsl_prop_register(), but this callback
    595  * will be called when a file system is first mounted
    596  */
    597 static void
    598 nbmand_changed_cb(void *arg, uint64_t newval)
    599 {
    600 	zfsvfs_t *zfsvfs = arg;
    601 	if (newval == FALSE) {
    602 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
    603 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
    604 	} else {
    605 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
    606 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
    607 	}
    608 }
    609 
    610 static void
    611 snapdir_changed_cb(void *arg, uint64_t newval)
    612 {
    613 	zfsvfs_t *zfsvfs = arg;
    614 
    615 	zfsvfs->z_show_ctldir = newval;
    616 }
    617 
    618 static void
    619 vscan_changed_cb(void *arg, uint64_t newval)
    620 {
    621 	zfsvfs_t *zfsvfs = arg;
    622 
    623 	zfsvfs->z_vscan = newval;
    624 }
    625 
    626 static void
    627 acl_mode_changed_cb(void *arg, uint64_t newval)
    628 {
    629 	zfsvfs_t *zfsvfs = arg;
    630 
    631 	zfsvfs->z_acl_mode = newval;
    632 }
    633 
    634 static void
    635 acl_inherit_changed_cb(void *arg, uint64_t newval)
    636 {
    637 	zfsvfs_t *zfsvfs = arg;
    638 
    639 	zfsvfs->z_acl_inherit = newval;
    640 }
    641 
    642 static int
    643 zfs_register_callbacks(vfs_t *vfsp)
    644 {
    645 	struct dsl_dataset *ds = NULL;
    646 	objset_t *os = NULL;
    647 	zfsvfs_t *zfsvfs = NULL;
    648 	uint64_t nbmand;
    649 	boolean_t readonly = B_FALSE;
    650 	boolean_t do_readonly = B_FALSE;
    651 	boolean_t setuid = B_FALSE;
    652 	boolean_t do_setuid = B_FALSE;
    653 	boolean_t exec = B_FALSE;
    654 	boolean_t do_exec = B_FALSE;
    655 #ifdef illumos
    656 	boolean_t devices = B_FALSE;
    657 	boolean_t do_devices = B_FALSE;
    658 #endif
    659 	boolean_t xattr = B_FALSE;
    660 	boolean_t do_xattr = B_FALSE;
    661 	boolean_t atime = B_FALSE;
    662 	boolean_t do_atime = B_FALSE;
    663 	int error = 0;
    664 
    665 	ASSERT(vfsp);
    666 	zfsvfs = vfsp->vfs_data;
    667 	ASSERT(zfsvfs);
    668 	os = zfsvfs->z_os;
    669 
    670 	/*
    671 	 * This function can be called for a snapshot when we update snapshot's
    672 	 * mount point, which isn't really supported.
    673 	 */
    674 	if (dmu_objset_is_snapshot(os))
    675 		return (EOPNOTSUPP);
    676 
    677 	/*
    678 	 * The act of registering our callbacks will destroy any mount
    679 	 * options we may have.  In order to enable temporary overrides
    680 	 * of mount options, we stash away the current values and
    681 	 * restore them after we register the callbacks.
    682 	 */
    683 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
    684 	    !spa_writeable(dmu_objset_spa(os))) {
    685 		readonly = B_TRUE;
    686 		do_readonly = B_TRUE;
    687 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
    688 		readonly = B_FALSE;
    689 		do_readonly = B_TRUE;
    690 	}
    691 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
    692 		setuid = B_FALSE;
    693 		do_setuid = B_TRUE;
    694 	} else {
    695 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
    696 			setuid = B_FALSE;
    697 			do_setuid = B_TRUE;
    698 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
    699 			setuid = B_TRUE;
    700 			do_setuid = B_TRUE;
    701 		}
    702 	}
    703 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
    704 		exec = B_FALSE;
    705 		do_exec = B_TRUE;
    706 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
    707 		exec = B_TRUE;
    708 		do_exec = B_TRUE;
    709 	}
    710 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
    711 		xattr = B_FALSE;
    712 		do_xattr = B_TRUE;
    713 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
    714 		xattr = B_TRUE;
    715 		do_xattr = B_TRUE;
    716 	}
    717 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
    718 		atime = B_FALSE;
    719 		do_atime = B_TRUE;
    720 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
    721 		atime = B_TRUE;
    722 		do_atime = B_TRUE;
    723 	}
    724 
    725 	/*
    726 	 * We need to enter pool configuration here, so that we can use
    727 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
    728 	 * dsl_prop_get_integer() can not be used, because it has to acquire
    729 	 * spa_namespace_lock and we can not do that because we already hold
    730 	 * z_teardown_lock.  The problem is that spa_config_sync() is called
    731 	 * with spa_namespace_lock held and the function calls ZFS vnode
    732 	 * operations to write the cache file and thus z_teardown_lock is
    733 	 * acquired after spa_namespace_lock.
    734 	 */
    735 	ds = dmu_objset_ds(os);
    736 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
    737 
    738 	/*
    739 	 * nbmand is a special property.  It can only be changed at
    740 	 * mount time.
    741 	 *
    742 	 * This is weird, but it is documented to only be changeable
    743 	 * at mount time.
    744 	 */
    745 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
    746 		nbmand = B_FALSE;
    747 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
    748 		nbmand = B_TRUE;
    749 	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
    750 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
    751 		return (error);
    752 	}
    753 
    754 	/*
    755 	 * Register property callbacks.
    756 	 *
    757 	 * It would probably be fine to just check for i/o error from
    758 	 * the first prop_register(), but I guess I like to go
    759 	 * overboard...
    760 	 */
    761 	error = dsl_prop_register(ds,
    762 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
    763 	error = error ? error : dsl_prop_register(ds,
    764 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
    765 	error = error ? error : dsl_prop_register(ds,
    766 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
    767 	error = error ? error : dsl_prop_register(ds,
    768 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
    769 #ifdef illumos
    770 	error = error ? error : dsl_prop_register(ds,
    771 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
    772 #endif
    773 	error = error ? error : dsl_prop_register(ds,
    774 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
    775 	error = error ? error : dsl_prop_register(ds,
    776 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
    777 	error = error ? error : dsl_prop_register(ds,
    778 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
    779 	error = error ? error : dsl_prop_register(ds,
    780 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
    781 	error = error ? error : dsl_prop_register(ds,
    782 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
    783 	    zfsvfs);
    784 	error = error ? error : dsl_prop_register(ds,
    785 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
    786 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
    787 	if (error)
    788 		goto unregister;
    789 
    790 	/*
    791 	 * Invoke our callbacks to restore temporary mount options.
    792 	 */
    793 	if (do_readonly)
    794 		readonly_changed_cb(zfsvfs, readonly);
    795 	if (do_setuid)
    796 		setuid_changed_cb(zfsvfs, setuid);
    797 	if (do_exec)
    798 		exec_changed_cb(zfsvfs, exec);
    799 	if (do_xattr)
    800 		xattr_changed_cb(zfsvfs, xattr);
    801 	if (do_atime)
    802 		atime_changed_cb(zfsvfs, atime);
    803 
    804 	nbmand_changed_cb(zfsvfs, nbmand);
    805 
    806 	return (0);
    807 
    808 unregister:
    809 	dsl_prop_unregister_all(ds, zfsvfs);
    810 	return (error);
    811 }
    812 
    813 static int
    814 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
    815     uint64_t *userp, uint64_t *groupp)
    816 {
    817 	/*
    818 	 * Is it a valid type of object to track?
    819 	 */
    820 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
    821 		return (SET_ERROR(ENOENT));
    822 
    823 	/*
    824 	 * If we have a NULL data pointer
    825 	 * then assume the id's aren't changing and
    826 	 * return EEXIST to the dmu to let it know to
    827 	 * use the same ids
    828 	 */
    829 	if (data == NULL)
    830 		return (SET_ERROR(EEXIST));
    831 
    832 	if (bonustype == DMU_OT_ZNODE) {
    833 		znode_phys_t *znp = data;
    834 		*userp = znp->zp_uid;
    835 		*groupp = znp->zp_gid;
    836 	} else {
    837 		int hdrsize;
    838 		sa_hdr_phys_t *sap = data;
    839 		sa_hdr_phys_t sa = *sap;
    840 		boolean_t swap = B_FALSE;
    841 
    842 		ASSERT(bonustype == DMU_OT_SA);
    843 
    844 		if (sa.sa_magic == 0) {
    845 			/*
    846 			 * This should only happen for newly created
    847 			 * files that haven't had the znode data filled
    848 			 * in yet.
    849 			 */
    850 			*userp = 0;
    851 			*groupp = 0;
    852 			return (0);
    853 		}
    854 		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
    855 			sa.sa_magic = SA_MAGIC;
    856 			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
    857 			swap = B_TRUE;
    858 		} else {
    859 			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
    860 		}
    861 
    862 		hdrsize = sa_hdrsize(&sa);
    863 		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
    864 		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
    865 		    SA_UID_OFFSET));
    866 		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
    867 		    SA_GID_OFFSET));
    868 		if (swap) {
    869 			*userp = BSWAP_64(*userp);
    870 			*groupp = BSWAP_64(*groupp);
    871 		}
    872 	}
    873 	return (0);
    874 }
    875 
    876 static void
    877 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
    878     char *domainbuf, int buflen, uid_t *ridp)
    879 {
    880 	uint64_t fuid;
    881 	const char *domain;
    882 
    883 	fuid = strtonum(fuidstr, NULL);
    884 
    885 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
    886 	if (domain)
    887 		(void) strlcpy(domainbuf, domain, buflen);
    888 	else
    889 		domainbuf[0] = '\0';
    890 	*ridp = FUID_RID(fuid);
    891 }
    892 
    893 static uint64_t
    894 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
    895 {
    896 	switch (type) {
    897 	case ZFS_PROP_USERUSED:
    898 		return (DMU_USERUSED_OBJECT);
    899 	case ZFS_PROP_GROUPUSED:
    900 		return (DMU_GROUPUSED_OBJECT);
    901 	case ZFS_PROP_USERQUOTA:
    902 		return (zfsvfs->z_userquota_obj);
    903 	case ZFS_PROP_GROUPQUOTA:
    904 		return (zfsvfs->z_groupquota_obj);
    905 	}
    906 	return (0);
    907 }
    908 
    909 int
    910 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    911     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
    912 {
    913 	int error;
    914 	zap_cursor_t zc;
    915 	zap_attribute_t za;
    916 	zfs_useracct_t *buf = vbuf;
    917 	uint64_t obj;
    918 
    919 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
    920 		return (SET_ERROR(ENOTSUP));
    921 
    922 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
    923 	if (obj == 0) {
    924 		*bufsizep = 0;
    925 		return (0);
    926 	}
    927 
    928 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
    929 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
    930 	    zap_cursor_advance(&zc)) {
    931 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
    932 		    *bufsizep)
    933 			break;
    934 
    935 		fuidstr_to_sid(zfsvfs, za.za_name,
    936 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
    937 
    938 		buf->zu_space = za.za_first_integer;
    939 		buf++;
    940 	}
    941 	if (error == ENOENT)
    942 		error = 0;
    943 
    944 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
    945 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
    946 	*cookiep = zap_cursor_serialize(&zc);
    947 	zap_cursor_fini(&zc);
    948 	return (error);
    949 }
    950 
    951 /*
    952  * buf must be big enough (eg, 32 bytes)
    953  */
    954 static int
    955 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
    956     char *buf, boolean_t addok)
    957 {
    958 	uint64_t fuid;
    959 	int domainid = 0;
    960 
    961 	if (domain && domain[0]) {
    962 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
    963 		if (domainid == -1)
    964 			return (SET_ERROR(ENOENT));
    965 	}
    966 	fuid = FUID_ENCODE(domainid, rid);
    967 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
    968 	return (0);
    969 }
    970 
    971 int
    972 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    973     const char *domain, uint64_t rid, uint64_t *valp)
    974 {
    975 	char buf[32];
    976 	int err;
    977 	uint64_t obj;
    978 
    979 	*valp = 0;
    980 
    981 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
    982 		return (SET_ERROR(ENOTSUP));
    983 
    984 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
    985 	if (obj == 0)
    986 		return (0);
    987 
    988 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
    989 	if (err)
    990 		return (err);
    991 
    992 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
    993 	if (err == ENOENT)
    994 		err = 0;
    995 	return (err);
    996 }
    997 
    998 int
    999 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
   1000     const char *domain, uint64_t rid, uint64_t quota)
   1001 {
   1002 	char buf[32];
   1003 	int err;
   1004 	dmu_tx_t *tx;
   1005 	uint64_t *objp;
   1006 	boolean_t fuid_dirtied;
   1007 
   1008 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
   1009 		return (SET_ERROR(EINVAL));
   1010 
   1011 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
   1012 		return (SET_ERROR(ENOTSUP));
   1013 
   1014 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
   1015 	    &zfsvfs->z_groupquota_obj;
   1016 
   1017 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
   1018 	if (err)
   1019 		return (err);
   1020 	fuid_dirtied = zfsvfs->z_fuid_dirty;
   1021 
   1022 	tx = dmu_tx_create(zfsvfs->z_os);
   1023 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
   1024 	if (*objp == 0) {
   1025 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
   1026 		    zfs_userquota_prop_prefixes[type]);
   1027 	}
   1028 	if (fuid_dirtied)
   1029 		zfs_fuid_txhold(zfsvfs, tx);
   1030 	err = dmu_tx_assign(tx, TXG_WAIT);
   1031 	if (err) {
   1032 		dmu_tx_abort(tx);
   1033 		return (err);
   1034 	}
   1035 
   1036 	mutex_enter(&zfsvfs->z_lock);
   1037 	if (*objp == 0) {
   1038 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
   1039 		    DMU_OT_NONE, 0, tx);
   1040 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
   1041 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
   1042 	}
   1043 	mutex_exit(&zfsvfs->z_lock);
   1044 
   1045 	if (quota == 0) {
   1046 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
   1047 		if (err == ENOENT)
   1048 			err = 0;
   1049 	} else {
   1050 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
   1051 	}
   1052 	ASSERT(err == 0);
   1053 	if (fuid_dirtied)
   1054 		zfs_fuid_sync(zfsvfs, tx);
   1055 	dmu_tx_commit(tx);
   1056 	return (err);
   1057 }
   1058 
   1059 boolean_t
   1060 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
   1061 {
   1062 	char buf[32];
   1063 	uint64_t used, quota, usedobj, quotaobj;
   1064 	int err;
   1065 
   1066 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
   1067 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
   1068 
   1069 	if (quotaobj == 0 || zfsvfs->z_replay)
   1070 		return (B_FALSE);
   1071 
   1072 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
   1073 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
   1074 	if (err != 0)
   1075 		return (B_FALSE);
   1076 
   1077 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
   1078 	if (err != 0)
   1079 		return (B_FALSE);
   1080 	return (used >= quota);
   1081 }
   1082 
   1083 boolean_t
   1084 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
   1085 {
   1086 	uint64_t fuid;
   1087 	uint64_t quotaobj;
   1088 
   1089 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
   1090 
   1091 	fuid = isgroup ? zp->z_gid : zp->z_uid;
   1092 
   1093 	if (quotaobj == 0 || zfsvfs->z_replay)
   1094 		return (B_FALSE);
   1095 
   1096 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
   1097 }
   1098 
   1099 /*
   1100  * Associate this zfsvfs with the given objset, which must be owned.
   1101  * This will cache a bunch of on-disk state from the objset in the
   1102  * zfsvfs.
   1103  */
   1104 static int
   1105 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
   1106 {
   1107 	int error;
   1108 	uint64_t val;
   1109 
   1110 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
   1111 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
   1112 	zfsvfs->z_os = os;
   1113 
   1114 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
   1115 	if (error != 0)
   1116 		return (error);
   1117 	if (zfsvfs->z_version >
   1118 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
   1119 		(void) printf("Can't mount a version %lld file system "
   1120 		    "on a version %lld pool\n. Pool must be upgraded to mount "
   1121 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
   1122 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
   1123 		return (SET_ERROR(ENOTSUP));
   1124 	}
   1125 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
   1126 	if (error != 0)
   1127 		return (error);
   1128 	zfsvfs->z_norm = (int)val;
   1129 
   1130 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
   1131 	if (error != 0)
   1132 		return (error);
   1133 	zfsvfs->z_utf8 = (val != 0);
   1134 
   1135 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
   1136 	if (error != 0)
   1137 		return (error);
   1138 	zfsvfs->z_case = (uint_t)val;
   1139 
   1140 	/*
   1141 	 * Fold case on file systems that are always or sometimes case
   1142 	 * insensitive.
   1143 	 */
   1144 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
   1145 	    zfsvfs->z_case == ZFS_CASE_MIXED)
   1146 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
   1147 
   1148 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
   1149 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
   1150 
   1151 	uint64_t sa_obj = 0;
   1152 	if (zfsvfs->z_use_sa) {
   1153 		/* should either have both of these objects or none */
   1154 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
   1155 		    &sa_obj);
   1156 		if (error != 0)
   1157 			return (error);
   1158 	}
   1159 
   1160 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
   1161 	    &zfsvfs->z_attr_table);
   1162 	if (error != 0)
   1163 		return (error);
   1164 
   1165 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
   1166 		sa_register_update_callback(os, zfs_sa_upgrade);
   1167 
   1168 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
   1169 	    &zfsvfs->z_root);
   1170 	if (error != 0)
   1171 		return (error);
   1172 	ASSERT(zfsvfs->z_root != 0);
   1173 
   1174 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
   1175 	    &zfsvfs->z_unlinkedobj);
   1176 	if (error != 0)
   1177 		return (error);
   1178 
   1179 	error = zap_lookup(os, MASTER_NODE_OBJ,
   1180 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
   1181 	    8, 1, &zfsvfs->z_userquota_obj);
   1182 	if (error == ENOENT)
   1183 		zfsvfs->z_userquota_obj = 0;
   1184 	else if (error != 0)
   1185 		return (error);
   1186 
   1187 	error = zap_lookup(os, MASTER_NODE_OBJ,
   1188 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
   1189 	    8, 1, &zfsvfs->z_groupquota_obj);
   1190 	if (error == ENOENT)
   1191 		zfsvfs->z_groupquota_obj = 0;
   1192 	else if (error != 0)
   1193 		return (error);
   1194 
   1195 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
   1196 	    &zfsvfs->z_fuid_obj);
   1197 	if (error == ENOENT)
   1198 		zfsvfs->z_fuid_obj = 0;
   1199 	else if (error != 0)
   1200 		return (error);
   1201 
   1202 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
   1203 	    &zfsvfs->z_shares_dir);
   1204 	if (error == ENOENT)
   1205 		zfsvfs->z_shares_dir = 0;
   1206 	else if (error != 0)
   1207 		return (error);
   1208 
   1209 	/*
   1210 	 * Only use the name cache if we are looking for a
   1211 	 * name on a file system that does not require normalization
   1212 	 * or case folding.  We can also look there if we happen to be
   1213 	 * on a non-normalizing, mixed sensitivity file system IF we
   1214 	 * are looking for the exact name (which is always the case on
   1215 	 * FreeBSD).
   1216 	 */
   1217 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
   1218 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
   1219 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
   1220 
   1221 	return (0);
   1222 }
   1223 
   1224 int
   1225 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
   1226 {
   1227 	objset_t *os;
   1228 	zfsvfs_t *zfsvfs;
   1229 	int error;
   1230 
   1231 	/*
   1232 	 * XXX: Fix struct statfs so this isn't necessary!
   1233 	 *
   1234 	 * The 'osname' is used as the filesystem's special node, which means
   1235 	 * it must fit in statfs.f_mntfromname, or else it can't be
   1236 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
   1237 	 * 'zfs unmount' to think it's not mounted when it is.
   1238 	 */
   1239 	if (strlen(osname) >= MNAMELEN)
   1240 		return (SET_ERROR(ENAMETOOLONG));
   1241 
   1242 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
   1243 
   1244 	/*
   1245 	 * We claim to always be readonly so we can open snapshots;
   1246 	 * other ZPL code will prevent us from writing to snapshots.
   1247 	 */
   1248 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
   1249 	if (error) {
   1250 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
   1251 		return (error);
   1252 	}
   1253 
   1254 	zfsvfs->z_vfs = NULL;
   1255 	zfsvfs->z_parent = zfsvfs;
   1256 
   1257 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
   1258 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
   1259 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
   1260 	    offsetof(znode_t, z_link_node));
   1261 #ifdef DIAGNOSTIC
   1262 	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
   1263 #else
   1264 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
   1265 #endif
   1266 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
   1267 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
   1268 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
   1269 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
   1270 
   1271 	error = zfsvfs_init(zfsvfs, os);
   1272 	if (error != 0) {
   1273 		dmu_objset_disown(os, zfsvfs);
   1274 		*zfvp = NULL;
   1275 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
   1276 		return (error);
   1277 	}
   1278 
   1279 	*zfvp = zfsvfs;
   1280 	return (0);
   1281 }
   1282 
   1283 static int
   1284 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
   1285 {
   1286 	int error;
   1287 
   1288 	error = zfs_register_callbacks(zfsvfs->z_vfs);
   1289 	if (error)
   1290 		return (error);
   1291 
   1292 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
   1293 
   1294 	/*
   1295 	 * If we are not mounting (ie: online recv), then we don't
   1296 	 * have to worry about replaying the log as we blocked all
   1297 	 * operations out since we closed the ZIL.
   1298 	 */
   1299 	if (mounting) {
   1300 		boolean_t readonly;
   1301 
   1302 		/*
   1303 		 * During replay we remove the read only flag to
   1304 		 * allow replays to succeed.
   1305 		 */
   1306 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
   1307 		if (readonly != 0)
   1308 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
   1309 		else
   1310 			zfs_unlinked_drain(zfsvfs);
   1311 
   1312 		/*
   1313 		 * Parse and replay the intent log.
   1314 		 *
   1315 		 * Because of ziltest, this must be done after
   1316 		 * zfs_unlinked_drain().  (Further note: ziltest
   1317 		 * doesn't use readonly mounts, where
   1318 		 * zfs_unlinked_drain() isn't called.)  This is because
   1319 		 * ziltest causes spa_sync() to think it's committed,
   1320 		 * but actually it is not, so the intent log contains
   1321 		 * many txg's worth of changes.
   1322 		 *
   1323 		 * In particular, if object N is in the unlinked set in
   1324 		 * the last txg to actually sync, then it could be
   1325 		 * actually freed in a later txg and then reallocated
   1326 		 * in a yet later txg.  This would write a "create
   1327 		 * object N" record to the intent log.  Normally, this
   1328 		 * would be fine because the spa_sync() would have
   1329 		 * written out the fact that object N is free, before
   1330 		 * we could write the "create object N" intent log
   1331 		 * record.
   1332 		 *
   1333 		 * But when we are in ziltest mode, we advance the "open
   1334 		 * txg" without actually spa_sync()-ing the changes to
   1335 		 * disk.  So we would see that object N is still
   1336 		 * allocated and in the unlinked set, and there is an
   1337 		 * intent log record saying to allocate it.
   1338 		 */
   1339 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
   1340 			if (zil_replay_disable) {
   1341 				zil_destroy(zfsvfs->z_log, B_FALSE);
   1342 			} else {
   1343 				zfsvfs->z_replay = B_TRUE;
   1344 				zil_replay(zfsvfs->z_os, zfsvfs,
   1345 				    zfs_replay_vector);
   1346 				zfsvfs->z_replay = B_FALSE;
   1347 			}
   1348 		}
   1349 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
   1350 	}
   1351 
   1352 	/*
   1353 	 * Set the objset user_ptr to track its zfsvfs.
   1354 	 */
   1355 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
   1356 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
   1357 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
   1358 
   1359 	return (0);
   1360 }
   1361 
   1362 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
   1363 
   1364 void
   1365 zfsvfs_free(zfsvfs_t *zfsvfs)
   1366 {
   1367 	int i;
   1368 
   1369 	/*
   1370 	 * This is a barrier to prevent the filesystem from going away in
   1371 	 * zfs_znode_move() until we can safely ensure that the filesystem is
   1372 	 * not unmounted. We consider the filesystem valid before the barrier
   1373 	 * and invalid after the barrier.
   1374 	 */
   1375 	rw_enter(&zfsvfs_lock, RW_READER);
   1376 	rw_exit(&zfsvfs_lock);
   1377 
   1378 	zfs_fuid_destroy(zfsvfs);
   1379 
   1380 	mutex_destroy(&zfsvfs->z_znodes_lock);
   1381 	mutex_destroy(&zfsvfs->z_lock);
   1382 	list_destroy(&zfsvfs->z_all_znodes);
   1383 	rrm_destroy(&zfsvfs->z_teardown_lock);
   1384 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
   1385 	rw_destroy(&zfsvfs->z_fuid_lock);
   1386 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
   1387 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
   1388 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
   1389 }
   1390 
   1391 static void
   1392 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
   1393 {
   1394 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
   1395 	if (zfsvfs->z_vfs) {
   1396 		if (zfsvfs->z_use_fuids) {
   1397 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
   1398 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
   1399 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
   1400 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
   1401 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
   1402 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
   1403 		} else {
   1404 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
   1405 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
   1406 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
   1407 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
   1408 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
   1409 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
   1410 		}
   1411 	}
   1412 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
   1413 }
   1414 
   1415 #ifdef __NetBSD__
   1416 int
   1417 #else
   1418 static int
   1419 #endif
   1420 zfs_domount(vfs_t *vfsp, char *osname)
   1421 {
   1422 	uint64_t recordsize, fsid_guid;
   1423 	int error = 0;
   1424 	zfsvfs_t *zfsvfs;
   1425 	vnode_t *vp;
   1426 
   1427 	ASSERT(vfsp);
   1428 	ASSERT(osname);
   1429 
   1430 	error = zfsvfs_create(osname, &zfsvfs);
   1431 	if (error)
   1432 		return (error);
   1433 	zfsvfs->z_vfs = vfsp;
   1434 
   1435 #ifdef illumos
   1436 	/* Initialize the generic filesystem structure. */
   1437 	vfsp->vfs_bcount = 0;
   1438 	vfsp->vfs_data = NULL;
   1439 
   1440 	if (zfs_create_unique_device(&mount_dev) == -1) {
   1441 		error = SET_ERROR(ENODEV);
   1442 		goto out;
   1443 	}
   1444 	ASSERT(vfs_devismounted(mount_dev) == 0);
   1445 #endif
   1446 
   1447 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
   1448 	    NULL))
   1449 		goto out;
   1450 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
   1451 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
   1452 
   1453 	vfsp->vfs_data = zfsvfs;
   1454 #ifdef __FreeBSD_kernel__
   1455 	vfsp->mnt_flag |= MNT_LOCAL;
   1456 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
   1457 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
   1458 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
   1459 	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
   1460 #endif
   1461 #ifdef __NetBSD__
   1462 	vfsp->mnt_flag |= MNT_LOCAL;
   1463 	vfsp->mnt_iflag |= IMNT_MPSAFE | IMNT_NCLOOKUP;
   1464 #endif
   1465 
   1466 	/*
   1467 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
   1468 	 * separates our fsid from any other filesystem types, and a
   1469 	 * 56-bit objset unique ID.  The objset unique ID is unique to
   1470 	 * all objsets open on this system, provided by unique_create().
   1471 	 * The 8-bit fs type must be put in the low bits of fsid[1]
   1472 	 * because that's where other Solaris filesystems put it.
   1473 	 */
   1474 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
   1475 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
   1476 #ifdef __FreeBSD_kernel__
   1477 	vfsp->vfs_fsid.val[0] = fsid_guid;
   1478 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
   1479 	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
   1480 #endif
   1481 #ifdef __NetBSD__
   1482 	vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
   1483 	vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) |
   1484 	    makefstype(vfsp->mnt_op->vfs_name) & 0xFF;
   1485 	/*
   1486 	 * Truncate fsid_guid to 32-bit for f_fsid.
   1487 	 *
   1488 	 * - f_fsid is a long, which can not hold 56-bit fsid_guid
   1489 	 *   on 32-bit architectures.
   1490 	 *
   1491 	 * - We use this value for stat(2)'s st_dev (dev_t) as well.
   1492 	 *   Some applications seem to assume the round-trip with
   1493 	 *   makedev macros. that is,
   1494 	 *
   1495 	 *      st_dev == makedev(major(st_dev), minor(st_dev))
   1496 	 *
   1497 	 *   While NetBSD's dev_t has been 64-bit since 2009, our
   1498 	 *   version of these macros only preserve the lower 32-bits.
   1499 	 */
   1500 	vfsp->mnt_stat.f_fsid = (uint32_t)fsid_guid;
   1501 #endif
   1502 
   1503 	/*
   1504 	 * Set features for file system.
   1505 	 */
   1506 	zfs_set_fuid_feature(zfsvfs);
   1507 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
   1508 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
   1509 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
   1510 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
   1511 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
   1512 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
   1513 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
   1514 	}
   1515 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
   1516 
   1517 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
   1518 		uint64_t pval;
   1519 
   1520 		atime_changed_cb(zfsvfs, B_FALSE);
   1521 		readonly_changed_cb(zfsvfs, B_TRUE);
   1522 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
   1523 			goto out;
   1524 		xattr_changed_cb(zfsvfs, pval);
   1525 		zfsvfs->z_issnap = B_TRUE;
   1526 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
   1527 
   1528 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
   1529 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
   1530 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
   1531 	} else {
   1532 		error = zfsvfs_setup(zfsvfs, B_TRUE);
   1533 	}
   1534 
   1535 #ifdef __FreeBSD_kernel__
   1536 	vfs_mountedfrom(vfsp, osname);
   1537 #endif
   1538 #ifdef __NetBSD__
   1539 	set_statvfs_info("on-name", UIO_SYSSPACE, osname, UIO_SYSSPACE, "zfs", vfsp, curlwp);
   1540 #endif
   1541 
   1542 	if (!zfsvfs->z_issnap)
   1543 		zfsctl_create(zfsvfs);
   1544 out:
   1545 	if (error) {
   1546 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
   1547 		zfsvfs_free(zfsvfs);
   1548 	} else {
   1549 		atomic_inc_32(&zfs_active_fs_count);
   1550 	}
   1551 
   1552 	return (error);
   1553 }
   1554 
   1555 void
   1556 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
   1557 {
   1558 	objset_t *os = zfsvfs->z_os;
   1559 
   1560 	if (!dmu_objset_is_snapshot(os))
   1561 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
   1562 }
   1563 
   1564 #ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns 0 and stores the value in *objnum on success.  Returns EINVAL,
 * leaving *objnum untouched, if str contains a non-digit character or
 * the value does not fit in 64 bits.  An empty string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	uint64_t num = 0;

	while (*str) {
		uint64_t scaled, next;

		if (*str < '0' || *str > '9')
			return (SET_ERROR(EINVAL));

		scaled = num * 10;
		next = scaled + (uint64_t)(*str++ - '0');

		/* reject values that wrapped around instead of truncating */
		if (scaled / 10 != num || next < scaled)
			return (SET_ERROR(EINVAL));

		num = next;
	}

	*objnum = num;
	return (0);
}
   1583 
   1584 /*
   1585  * The boot path passed from the boot loader is in the form of
   1586  * "rootpool-name/root-filesystem-object-number'. Convert this
   1587  * string to a dataset name: "rootpool-name/root-filesystem-name".
   1588  */
   1589 static int
   1590 zfs_parse_bootfs(char *bpath, char *outpath)
   1591 {
   1592 	char *slashp;
   1593 	uint64_t objnum;
   1594 	int error;
   1595 
   1596 	if (*bpath == 0 || *bpath == '/')
   1597 		return (SET_ERROR(EINVAL));
   1598 
   1599 	(void) strcpy(outpath, bpath);
   1600 
   1601 	slashp = strchr(bpath, '/');
   1602 
   1603 	/* if no '/', just return the pool name */
   1604 	if (slashp == NULL) {
   1605 		return (0);
   1606 	}
   1607 
   1608 	/* if not a number, just return the root dataset name */
   1609 	if (str_to_uint64(slashp+1, &objnum)) {
   1610 		return (0);
   1611 	}
   1612 
   1613 	*slashp = '\0';
   1614 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
   1615 	*slashp = '/';
   1616 
   1617 	return (error);
   1618 }
   1619 
   1620 /*
   1621  * Check that the hex label string is appropriate for the dataset being
   1622  * mounted into the global_zone proper.
   1623  *
   1624  * Return an error if the hex label string is not default or
   1625  * admin_low/admin_high.  For admin_low labels, the corresponding
   1626  * dataset must be readonly.
   1627  */
   1628 int
   1629 zfs_check_global_label(const char *dsname, const char *hexsl)
   1630 {
   1631 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
   1632 		return (0);
   1633 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
   1634 		return (0);
   1635 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
   1636 		/* must be readonly */
   1637 		uint64_t rdonly;
   1638 
   1639 		if (dsl_prop_get_integer(dsname,
   1640 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
   1641 			return (SET_ERROR(EACCES));
   1642 		return (rdonly ? 0 : EACCES);
   1643 	}
   1644 	return (SET_ERROR(EACCES));
   1645 }
   1646 
   1647 /*
   1648  * Determine whether the mount is allowed according to MAC check.
   1649  * by comparing (where appropriate) label of the dataset against
   1650  * the label of the zone being mounted into.  If the dataset has
   1651  * no label, create one.
   1652  *
   1653  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
   1654  */
   1655 static int
   1656 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
   1657 {
   1658 	int		error, retv;
   1659 	zone_t		*mntzone = NULL;
   1660 	ts_label_t	*mnt_tsl;
   1661 	bslabel_t	*mnt_sl;
   1662 	bslabel_t	ds_sl;
   1663 	char		ds_hexsl[MAXNAMELEN];
   1664 
   1665 	retv = EACCES;				/* assume the worst */
   1666 
   1667 	/*
   1668 	 * Start by getting the dataset label if it exists.
   1669 	 */
   1670 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
   1671 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
   1672 	if (error)
   1673 		return (SET_ERROR(EACCES));
   1674 
   1675 	/*
   1676 	 * If labeling is NOT enabled, then disallow the mount of datasets
   1677 	 * which have a non-default label already.  No other label checks
   1678 	 * are needed.
   1679 	 */
   1680 	if (!is_system_labeled()) {
   1681 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
   1682 			return (0);
   1683 		return (SET_ERROR(EACCES));
   1684 	}
   1685 
   1686 	/*
   1687 	 * Get the label of the mountpoint.  If mounting into the global
   1688 	 * zone (i.e. mountpoint is not within an active zone and the
   1689 	 * zoned property is off), the label must be default or
   1690 	 * admin_low/admin_high only; no other checks are needed.
   1691 	 */
   1692 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
   1693 	if (mntzone->zone_id == GLOBAL_ZONEID) {
   1694 		uint64_t zoned;
   1695 
   1696 		zone_rele(mntzone);
   1697 
   1698 		if (dsl_prop_get_integer(osname,
   1699 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
   1700 			return (SET_ERROR(EACCES));
   1701 		if (!zoned)
   1702 			return (zfs_check_global_label(osname, ds_hexsl));
   1703 		else
   1704 			/*
   1705 			 * This is the case of a zone dataset being mounted
   1706 			 * initially, before the zone has been fully created;
   1707 			 * allow this mount into global zone.
   1708 			 */
   1709 			return (0);
   1710 	}
   1711 
   1712 	mnt_tsl = mntzone->zone_slabel;
   1713 	ASSERT(mnt_tsl != NULL);
   1714 	label_hold(mnt_tsl);
   1715 	mnt_sl = label2bslabel(mnt_tsl);
   1716 
   1717 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
   1718 		/*
   1719 		 * The dataset doesn't have a real label, so fabricate one.
   1720 		 */
   1721 		char *str = NULL;
   1722 
   1723 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
   1724 		    dsl_prop_set_string(osname,
   1725 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
   1726 		    ZPROP_SRC_LOCAL, str) == 0)
   1727 			retv = 0;
   1728 		if (str != NULL)
   1729 			kmem_free(str, strlen(str) + 1);
   1730 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
   1731 		/*
   1732 		 * Now compare labels to complete the MAC check.  If the
   1733 		 * labels are equal then allow access.  If the mountpoint
   1734 		 * label dominates the dataset label, allow readonly access.
   1735 		 * Otherwise, access is denied.
   1736 		 */
   1737 		if (blequal(mnt_sl, &ds_sl))
   1738 			retv = 0;
   1739 		else if (bldominates(mnt_sl, &ds_sl)) {
   1740 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
   1741 			retv = 0;
   1742 		}
   1743 	}
   1744 
   1745 	label_rele(mnt_tsl);
   1746 	zone_rele(mntzone);
   1747 	return (retv);
   1748 }
   1749 #endif	/* SECLABEL */
   1750 
   1751 #ifdef OPENSOLARIS_MOUNTROOT
   1752 static int
   1753 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
   1754 {
   1755 	int error = 0;
   1756 	static int zfsrootdone = 0;
   1757 	zfsvfs_t *zfsvfs = NULL;
   1758 	znode_t *zp = NULL;
   1759 	vnode_t *vp = NULL;
   1760 	char *zfs_bootfs;
   1761 	char *zfs_devid;
   1762 
   1763 	ASSERT(vfsp);
   1764 
   1765 	/*
   1766 	 * The filesystem that we mount as root is defined in the
   1767 	 * boot property "zfs-bootfs" with a format of
   1768 	 * "poolname/root-dataset-objnum".
   1769 	 */
   1770 	if (why == ROOT_INIT) {
   1771 		if (zfsrootdone++)
   1772 			return (SET_ERROR(EBUSY));
   1773 		/*
   1774 		 * the process of doing a spa_load will require the
   1775 		 * clock to be set before we could (for example) do
   1776 		 * something better by looking at the timestamp on
   1777 		 * an uberblock, so just set it to -1.
   1778 		 */
   1779 		clkset(-1);
   1780 
   1781 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
   1782 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
   1783 			    "bootfs name");
   1784 			return (SET_ERROR(EINVAL));
   1785 		}
   1786 		zfs_devid = spa_get_bootprop("diskdevid");
   1787 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
   1788 		if (zfs_devid)
   1789 			spa_free_bootprop(zfs_devid);
   1790 		if (error) {
   1791 			spa_free_bootprop(zfs_bootfs);
   1792 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
   1793 			    error);
   1794 			return (error);
   1795 		}
   1796 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
   1797 			spa_free_bootprop(zfs_bootfs);
   1798 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
   1799 			    error);
   1800 			return (error);
   1801 		}
   1802 
   1803 		spa_free_bootprop(zfs_bootfs);
   1804 
   1805 		if (error = vfs_lock(vfsp))
   1806 			return (error);
   1807 
   1808 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
   1809 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
   1810 			goto out;
   1811 		}
   1812 
   1813 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
   1814 		ASSERT(zfsvfs);
   1815 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
   1816 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
   1817 			goto out;
   1818 		}
   1819 
   1820 		vp = ZTOV(zp);
   1821 		mutex_enter(&vp->v_lock);
   1822 		vp->v_flag |= VROOT;
   1823 		mutex_exit(&vp->v_lock);
   1824 		rootvp = vp;
   1825 
   1826 		/*
   1827 		 * Leave rootvp held.  The root file system is never unmounted.
   1828 		 */
   1829 
   1830 		vfs_add((struct vnode *)0, vfsp,
   1831 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
   1832 out:
   1833 		vfs_unlock(vfsp);
   1834 		return (error);
   1835 	} else if (why == ROOT_REMOUNT) {
   1836 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
   1837 		vfsp->vfs_flag |= VFS_REMOUNT;
   1838 
   1839 		/* refresh mount options */
   1840 		zfs_unregister_callbacks(vfsp->vfs_data);
   1841 		return (zfs_register_callbacks(vfsp));
   1842 
   1843 	} else if (why == ROOT_UNMOUNT) {
   1844 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
   1845 		(void) zfs_sync(vfsp, 0, 0);
   1846 		return (0);
   1847 	}
   1848 
   1849 	/*
   1850 	 * if "why" is equal to anything else other than ROOT_INIT,
   1851 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
   1852 	 */
   1853 	return (SET_ERROR(ENOTSUP));
   1854 }
   1855 #endif	/* OPENSOLARIS_MOUNTROOT */
   1856 
/*
 * Extract the pool component of a dataset name.
 *
 * The pool name is everything in 'osname' up to, but not including, the
 * first '/' (or the whole string when there is no '/').  It is copied,
 * NUL-terminated, into 'poolname', which must have room for at least
 * MAXNAMELEN bytes.
 *
 * Returns 0 on success, or ENAMETOOLONG when the pool component would not
 * fit in MAXNAMELEN bytes including the terminator.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *slash = strchr(osname, '/');
	size_t len;

	/* Length of the pool component, excluding any '/' separator. */
	if (slash != NULL)
		len = (size_t)(slash - osname);
	else
		len = strlen(osname);

	if (len >= MAXNAMELEN)
		return (ENAMETOOLONG);

	(void) strncpy(poolname, osname, len);
	poolname[len] = '\0';
	return (0);
}
   1875 
   1876 /*ARGSUSED*/
   1877 #ifdef illumos
   1878 static int
   1879 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
   1880 #endif
   1881 #ifdef __FreeBSD_kernel__
   1882 static int
   1883 zfs_mount(vfs_t *vfsp)
   1884 #endif
   1885 #ifdef __NetBSD__
   1886 static int
   1887 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len)
   1888 #endif
   1889 {
   1890 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
   1891 	char		*osname;
   1892 	int		error = 0;
   1893 	int		canwrite;
   1894 
   1895 #ifdef illumos
   1896 	if (mvp->v_type != VDIR)
   1897 		return (SET_ERROR(ENOTDIR));
   1898 
   1899 	mutex_enter(&mvp->v_lock);
   1900 	if ((uap->flags & MS_REMOUNT) == 0 &&
   1901 	    (uap->flags & MS_OVERLAY) == 0 &&
   1902 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
   1903 		mutex_exit(&mvp->v_lock);
   1904 		return (SET_ERROR(EBUSY));
   1905 	}
   1906 	mutex_exit(&mvp->v_lock);
   1907 
   1908 	/*
   1909 	 * ZFS does not support passing unparsed data in via MS_DATA.
   1910 	 * Users should use the MS_OPTIONSTR interface; this means
   1911 	 * that all option parsing is already done and the options struct
   1912 	 * can be interrogated.
   1913 	 */
   1914 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
   1915 		return (SET_ERROR(EINVAL));
   1916 #endif /* illumos */
   1917 
   1918 #ifdef __FreeBSD_kernel__
   1919 	kthread_t	*td = curthread;
   1920 	cred_t		*cr = td->td_ucred;
   1921 
   1922 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
   1923 		return (SET_ERROR(EPERM));
   1924 
   1925 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
   1926 		return (SET_ERROR(EINVAL));
   1927 
   1928 	/*
   1929 	 * If full-owner-access is enabled and delegated administration is
   1930 	 * turned on, we must set nosuid.
   1931 	 */
   1932 	if (zfs_super_owner &&
   1933 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
   1934 		secpolicy_fs_mount_clearopts(cr, vfsp);
   1935 	}
   1936 
   1937 #endif /* __FreeBSD_kernel__ */
   1938 
   1939 #ifdef __NetBSD__
   1940 	cred_t		*cr = CRED();
   1941 	struct mounta	*uap = data;
   1942 
   1943 	/*
   1944 	 * reject all op flags for now.
   1945 	 *
   1946 	 * - the code below is inconsistent. sometimes it checks uap->flags,
   1947 	 *   sometimes vfsp->vfs_flag. (aka mnt_flag)
   1948 	 *
   1949 	 * - our userland tools (zfs, mount_zfs) currently don't seem to have
   1950 	 *   a way to pass these flags anyway. (zmount in libzfs always passes
   1951 	 *   0 to both of mount(2) 'flags' argument and 'uap->flags'. although
   1952 	 *   it stores something in uap->mflag and uap->optptr, nothing uses
   1953 	 *   them. it doesn't even set MS_OPTIONSTR. we don't implement
   1954 	 *   MS_OPTIONSTR anyway.)
   1955 	 */
   1956 	if ((vfsp->mnt_flag & MNT_OP_FLAGS) != 0)
   1957 		return (SET_ERROR(ENOTSUP));
   1958 
   1959 	if (uap == NULL)
   1960 		return (SET_ERROR(EINVAL));
   1961 
   1962 	if (*data_len < sizeof *uap)
   1963 		return (SET_ERROR(EINVAL));
   1964 
   1965 	if (mvp->v_type != VDIR)
   1966 		return (SET_ERROR(ENOTDIR));
   1967 
   1968 	mutex_enter(mvp->v_interlock);
   1969 	if ((uap->flags & MS_REMOUNT) == 0 &&
   1970 	    (uap->flags & MS_OVERLAY) == 0 &&
   1971 	    (vrefcnt(mvp) != 1 || (mvp->v_flag & VROOT))) {
   1972 		mutex_exit(mvp->v_interlock);
   1973 		return (SET_ERROR(EBUSY));
   1974 	}
   1975 	mutex_exit(mvp->v_interlock);
   1976 
   1977 	osname = PNBUF_GET();
   1978 	strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1);
   1979 #endif /* __NetBSD__ */
   1980 
   1981 	/*
   1982 	 * Check for mount privilege?
   1983 	 *
   1984 	 * If we don't have privilege then see if
   1985 	 * we have local permission to allow it
   1986 	 */
   1987 	error = secpolicy_fs_mount(cr, mvp, vfsp);
   1988 	if (error) {
   1989 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
   1990 			goto out;
   1991 
   1992 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
   1993 			vattr_t		vattr;
   1994 
   1995 			/*
   1996 			 * Make sure user is the owner of the mount point
   1997 			 * or has sufficient privileges.
   1998 			 */
   1999 
   2000 			vattr.va_mask = AT_UID;
   2001 
   2002 #ifdef __FreeBSD_kernel__
   2003 			vn_lock(mvp, LK_SHARED | LK_RETRY);
   2004 			if (VOP_GETATTR(mvp, &vattr, cr)) {
   2005 				VOP_UNLOCK(mvp, 0);
   2006 				goto out;
   2007 			}
   2008 
   2009 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
   2010 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
   2011 				VOP_UNLOCK(mvp, 0);
   2012 				goto out;
   2013 			}
   2014 			VOP_UNLOCK(mvp, 0);
   2015 #endif
   2016 #ifdef __NetBSD__
   2017 			vn_lock(mvp, LK_SHARED | LK_RETRY);
   2018 			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
   2019 				VOP_UNLOCK(mvp, 0);
   2020 				goto out;
   2021 			}
   2022 
   2023 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
   2024 			    VOP_ACCESS(mvp, VWRITE, cr) != 0) {
   2025 				VOP_UNLOCK(mvp, 0);
   2026 				goto out;
   2027 			}
   2028 			VOP_UNLOCK(mvp, 0);
   2029 #endif
   2030 		}
   2031 
   2032 		secpolicy_fs_mount_clearopts(cr, vfsp);
   2033 	}
   2034 
   2035 	/*
   2036 	 * Refuse to mount a filesystem if we are in a local zone and the
   2037 	 * dataset is not visible.
   2038 	 */
   2039 	if (!INGLOBALZONE(curthread) &&
   2040 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
   2041 		error = SET_ERROR(EPERM);
   2042 		goto out;
   2043 	}
   2044 
   2045 #ifdef SECLABEL
   2046 	error = zfs_mount_label_policy(vfsp, osname);
   2047 	if (error)
   2048 		goto out;
   2049 #endif
   2050 
   2051 #ifdef __FreeBSD_kernel__
   2052 	vfsp->vfs_flag |= MNT_NFS4ACLS;
   2053 #endif
   2054 #ifdef __NetBSD__
   2055 	vfsp->mnt_iflag |= IMNT_MPSAFE | IMNT_NCLOOKUP;
   2056 #endif
   2057 
   2058 	/*
   2059 	 * When doing a remount, we simply refresh our temporary properties
   2060 	 * according to those options set in the current VFS options.
   2061 	 */
   2062 	if (vfsp->vfs_flag & MS_REMOUNT) {
   2063 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
   2064 
   2065 		/*
   2066 		 * Refresh mount options with z_teardown_lock blocking I/O while
   2067 		 * the filesystem is in an inconsistent state.
   2068 		 * The lock also serializes this code with filesystem
   2069 		 * manipulations between entry to zfs_suspend_fs() and return
   2070 		 * from zfs_resume_fs().
   2071 		 */
   2072 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
   2073 		zfs_unregister_callbacks(zfsvfs);
   2074 		error = zfs_register_callbacks(vfsp);
   2075 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
   2076 		goto out;
   2077 	}
   2078 
   2079 #ifdef __FreeBSD_kernel__
   2080 	/* Initial root mount: try hard to import the requested root pool. */
   2081 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
   2082 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
   2083 		char pname[MAXNAMELEN];
   2084 
   2085 		error = getpoolname(osname, pname);
   2086 		if (error == 0)
   2087 			error = spa_import_rootpool(pname);
   2088 		if (error)
   2089 			goto out;
   2090 	}
   2091 #endif
   2092 
   2093 	DROP_GIANT();
   2094 	error = zfs_domount(vfsp, osname);
   2095 	PICKUP_GIANT();
   2096 
   2097 #ifdef illumos
   2098 	/*
   2099 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
   2100 	 * disappear due to a forced unmount.
   2101 	 */
   2102 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
   2103 		VFS_HOLD(mvp->v_vfsp);
   2104 #endif
   2105 
   2106 #ifdef __NetBSD__
   2107 	/* setup zfs mount info */
   2108 	strlcpy(vfsp->mnt_stat.f_mntfromname, osname,
   2109 	    sizeof(vfsp->mnt_stat.f_mntfromname));
   2110 	set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname,
   2111 	    UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp);
   2112 #endif
   2113 
   2114 out:
   2115 	return (error);
   2116 }
   2117 
/*
 * Fill in filesystem statistics for the mounted dataset behind vfsp.
 *
 * Space and object counts are obtained from dmu_objset_space() and
 * reported in units of SPA_MINBLOCKSIZE.  The function is named
 * zfs_statfs() and takes a struct statfs on FreeBSD; on NetBSD it is
 * zfs_statvfs() and takes a struct statvfs.
 *
 * Returns 0 once the fields have been filled in (ZFS_ENTER() may cause an
 * early error return before any field is touched -- see its definition).
 */
#ifdef __FreeBSD_kernel__
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
#endif
#ifdef __NetBSD__
static int
zfs_statvfs(vfs_t *vfsp, struct statvfs *statp)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;

#ifdef __FreeBSD_kernel__
	statp->f_version = STATFS_VERSION;
#endif

	ZFS_ENTER(zfsvfs);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
#ifdef __NetBSD__
	statp->f_frsize = SPA_MINBLOCKSIZE;
#endif
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	/* f_bsize was set to SPA_MINBLOCKSIZE above, so this is the same unit. */
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */
	statp->f_bresvd = 0;

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
#ifndef __FreeBSD__
	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
#endif
	statp->f_files = statp->f_ffree + usedobjs;
	statp->f_fresvd = 0;

#ifdef __FreeBSD__
	/*
	 * NOTE(review): 'd32' is not declared anywhere in this function, so
	 * this branch cannot compile as written when __FreeBSD__ is defined.
	 */
	(void) cmpldev(&d32, vfsp->vfs_dev);
	statp->f_fsid = d32;
#endif
#ifdef __NetBSD__
	statp->f_fsid = vfsp->mnt_stat.f_fsid;
	statp->f_fsidx = vfsp->mnt_stat.f_fsidx;
#endif

	/*
	 * We're a zfs filesystem.
	 */
	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));

	/* Mount source and mount point are copied from the cached mnt_stat. */
	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof(statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof(statp->f_mntonname));

#ifdef __FreeBSD_kernel__
	statp->f_namemax = MAXNAMELEN - 1;
#endif
#ifdef __NetBSD__
	statp->f_namemax = ZFS_MAXNAMELEN;
#endif

	ZFS_EXIT(zfsvfs);
	return (0);
}
   2205 
   2206 static int
   2207 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
   2208 {
   2209 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   2210 	znode_t *rootzp;
   2211 	int error;
   2212 
   2213 	ZFS_ENTER(zfsvfs);
   2214 
   2215 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
   2216 	if (error == 0)
   2217 		*vpp = ZTOV(rootzp);
   2218 
   2219 	ZFS_EXIT(zfsvfs);
   2220 
   2221 	if (error == 0) {
   2222 		error = vn_lock(*vpp, flags);
   2223 		if (error != 0) {
   2224 			VN_RELE(*vpp);
   2225 			*vpp = NULL;
   2226 		}
   2227 	}
   2228 	return (error);
   2229 }
   2230 
/*
 * Teardown the zfsvfs::z_os.
 *
 * Takes 'z_teardown_lock' and 'z_teardown_inactive_lock' as writer, closes
 * the ZIL, drops the dbuf holds of every znode on z_all_znodes, unregisters
 * the property callbacks and evicts the objset's cached dbufs.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.  The exception is the EIO return
 * below, where both locks are dropped before returning.
 *
 * Returns 0 on success, or EIO when not unmounting and the filesystem was
 * already unmounted or has no objset.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;

	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
#ifdef FREEBSD_NAMECACHE
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#ifdef __NetBSD__
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_sa_hdl) {
#ifdef __NetBSD__
			ASSERT(vrefcnt(ZTOV(zp)) >= 0);
#else
			ASSERT(ZTOV(zp)->v_count >= 0);
#endif
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data.  A dirty dataset on a writable mount is synced
	 * out first.
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);

	return (0);
}
   2336 
/*
 * Unmount the filesystem behind vfsp.
 *
 * The caller must pass secpolicy_fs_unmount() or hold the delegated
 * ZFS_DELEG_PERM_MOUNT permission.  Snapshots mounted under .zfs are
 * unmounted first, all vnodes are flushed (forcibly when MS_FORCE is set
 * in 'fflag'), and then the zfsvfs is torn down, the objset is disowned
 * and the zfsvfs is freed.
 *
 * Returns 0 on success or an errno value if a permission check, the
 * snapshot unmount, or the vnode flush fails.
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	int ret;
#ifdef __FreeBSD_kernel__
	kthread_t *td = curthread;
	cred_t *cr = td->td_ucred;
#endif
#ifdef __NetBSD__
	cred_t *cr = CRED();
	struct vnode_iterator *marker;
	vnode_t *vp;
#endif

	/*
	 * Without unmount privilege, fall back to the delegated mount
	 * permission; if that is also denied, report the original error.
	 */
	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * We purge the parent filesystem's vfsp as the parent filesystem
	 * and all of its snapshots have their vnode's v_vfsp set to the
	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
	 * referential for non-snapshots.
	 */
	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
	}

	/*
	 * Flush all the files.
	 */
#ifdef __FreeBSD_kernel__
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
#endif
#ifdef __NetBSD__
	/*
	 * we loop here because zil_commit can bring some vnodes
	 * back to mnt_vnodelist via zfs_get_data.
	 *
	 * The iterator is only used to test whether any vnode is left: it
	 * is destroyed before each vflush() and re-created afterwards, so
	 * the early return below does not leave an iterator behind.
	 */
	vfs_vnode_iterator_init(vfsp, &marker);
	while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
		VN_RELE(vp);
		vfs_vnode_iterator_destroy(marker);
		ret = vflush(vfsp, NULL, (fflag & MS_FORCE) ? FORCECLOSE : 0);
		if (ret != 0)
			return (ret);
		if (zfsvfs->z_log)
			zil_commit(zfsvfs->z_log, 0);
		vfs_vnode_iterator_init(vfsp, &marker);
	}
	vfs_vnode_iterator_destroy(marker);
#endif

#ifdef illumos
	if (!(fflag & MS_FORCE)) {
		/*
		 * Check the number of active vnodes in the file system.
		 * Our count is maintained in the vfs structure, but the
		 * number is off by 1 to indicate a hold on the vfs
		 * structure itself.
		 *
		 * The '.zfs' directory maintains a reference of its
		 * own, and any active references underneath are
		 * reflected in the vnode count.
		 */
		if (zfsvfs->z_ctldir == NULL) {
			if (vfsp->vfs_count > 1)
				return (SET_ERROR(EBUSY));
		} else {
			if (vfsp->vfs_count > 2 ||
			    zfsvfs->z_ctldir->v_count > 1)
				return (SET_ERROR(EBUSY));
		}
	}
#endif

	/* With unmounting == B_TRUE the teardown has no failure return. */
	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}
   2469 
   2470 static int
   2471 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
   2472 {
   2473 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
   2474 	znode_t		*zp;
   2475 	int 		err;
   2476 
   2477 	/*
   2478 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
   2479 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
   2480 	 * This will make NFS to switch to LOOKUP instead of using VGET.
   2481 	 */
   2482 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
   2483 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
   2484 		return (EOPNOTSUPP);
   2485 
   2486 	ZFS_ENTER(zfsvfs);
   2487 	err = zfs_zget(zfsvfs, ino, &zp);
   2488 	if (err == 0 && zp->z_unlinked) {
   2489 		VN_RELE(ZTOV(zp));
   2490 		err = EINVAL;
   2491 	}
   2492 	if (err == 0)
   2493 		*vpp = ZTOV(zp);
   2494 	ZFS_EXIT(zfsvfs);
   2495 	if (err == 0)
   2496 		err = vn_lock(*vpp, flags);
   2497 	if (err != 0)
   2498 		*vpp = NULL;
   2499 
   2500 	return (err);
   2501 }
   2502 
   2503 #ifdef __FreeBSD_kernel__
   2504 static int
   2505 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
   2506     struct ucred **credanonp, int *numsecflavors, int **secflavors)
   2507 {
   2508 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   2509 
   2510 	/*
   2511 	 * If this is regular file system vfsp is the same as
   2512 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
   2513 	 * zfsvfs->z_parent->z_vfs represents parent file system
   2514 	 * which we have to use here, because only this file system
   2515 	 * has mnt_export configured.
   2516 	 */
   2517 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
   2518 	    credanonp, numsecflavors, secflavors));
   2519 }
   2520 
/* Compile-time check that both ZFS fid layouts fit inside a struct fid. */
CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
   2523 #endif
   2524 
   2525 #ifdef __FreeBSD_kernel__
   2526 static int
   2527 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
   2528 {
   2529 	struct componentname cn;
   2530 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
   2531 	znode_t		*zp;
   2532 	vnode_t		*dvp;
   2533 	uint64_t	object = 0;
   2534 	uint64_t	fid_gen = 0;
   2535 	uint64_t	gen_mask;
   2536 	uint64_t	zp_gen;
   2537 	int 		i, err;
   2538 
   2539 	*vpp = NULL;
   2540 
   2541 	ZFS_ENTER(zfsvfs);
   2542 
   2543 	/*
   2544 	 * On FreeBSD we can get snapshot's mount point or its parent file
   2545 	 * system mount point depending if snapshot is already mounted or not.
   2546 	 */
   2547 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
   2548 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
   2549 		uint64_t	objsetid = 0;
   2550 		uint64_t	setgen = 0;
   2551 
   2552 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
   2553 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
   2554 
   2555 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
   2556 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
   2557 
   2558 		ZFS_EXIT(zfsvfs);
   2559 
   2560 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
   2561 		if (err)
   2562 			return (SET_ERROR(EINVAL));
   2563 		ZFS_ENTER(zfsvfs);
   2564 	}
   2565 
   2566 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
   2567 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
   2568 
   2569 		for (i = 0; i < sizeof (zfid->zf_object); i++)
   2570 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
   2571 
   2572 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
   2573 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
   2574 	} else {
   2575 		ZFS_EXIT(zfsvfs);
   2576 		return (SET_ERROR(EINVAL));
   2577 	}
   2578 
   2579 	/*
   2580 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
   2581 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
   2582 	 * we are in the .zfs/shares directory tree.
   2583 	 */
   2584 	if ((fid_gen == 0 &&
   2585 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
   2586 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
   2587 		ZFS_EXIT(zfsvfs);
   2588 		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
   2589 		if (object == ZFSCTL_INO_SNAPDIR) {
   2590 			cn.cn_nameptr = "snapshot";
   2591 			cn.cn_namelen = strlen(cn.cn_nameptr);
   2592 			cn.cn_nameiop = LOOKUP;
   2593 			cn.cn_flags = ISLASTCN | LOCKLEAF;
   2594 			cn.cn_lkflags = flags;
   2595 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
   2596 			vput(dvp);
   2597 		} else if (object == zfsvfs->z_shares_dir) {
   2598 			/*
   2599 			 * XXX This branch must not be taken,
   2600 			 * if it is, then the lookup below will
   2601 			 * explode.
   2602 			 */
   2603 			cn.cn_nameptr = "shares";
   2604 			cn.cn_namelen = strlen(cn.cn_nameptr);
   2605 			cn.cn_nameiop = LOOKUP;
   2606 			cn.cn_flags = ISLASTCN;
   2607 			cn.cn_lkflags = flags;
   2608 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
   2609 			vput(dvp);
   2610 		} else {
   2611 			*vpp = dvp;
   2612 		}
   2613 		return (err);
   2614 	}
   2615 
   2616 	gen_mask = -1ULL >> (64 - 8 * i);
   2617 
   2618 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
   2619 	if (err = zfs_zget(zfsvfs, object, &zp)) {
   2620 		ZFS_EXIT(zfsvfs);
   2621 		return (err);
   2622 	}
   2623 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
   2624 	    sizeof (uint64_t));
   2625 	zp_gen = zp_gen & gen_mask;
   2626 	if (zp_gen == 0)
   2627 		zp_gen = 1;
   2628 	if (zp->z_unlinked || zp_gen != fid_gen) {
   2629 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
   2630 		VN_RELE(ZTOV(zp));
   2631 		ZFS_EXIT(zfsvfs);
   2632 		return (SET_ERROR(EINVAL));
   2633 	}
   2634 
   2635 	*vpp = ZTOV(zp);
   2636 	ZFS_EXIT(zfsvfs);
   2637 	err = vn_lock(*vpp, flags);
   2638 	if (err == 0)
   2639 		vnode_create_vobject(*vpp, zp->z_size, curthread);
   2640 	else
   2641 		*vpp = NULL;
   2642 	return (err);
   2643 }
   2644 #endif /* __FreeBSD_kernel__ */
   2645 
   2646 /*
   2647  * Block out VOPs and close zfsvfs_t::z_os
   2648  *
   2649  * Note, if successful, then we return with the 'z_teardown_lock' and
   2650  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
   2651  * dataset and objset intact so that they can be atomically handed off during
   2652  * a subsequent rollback or recv operation and the resume thereafter.
   2653  */
   2654 int
   2655 zfs_suspend_fs(zfsvfs_t *zfsvfs)
   2656 {
   2657 	int error;
   2658 
   2659 #ifdef __NetBSD__
   2660 	if ((error = vfs_suspend(zfsvfs->z_vfs, 0)) != 0)
   2661 		return error;
   2662 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
   2663 		vfs_resume(zfsvfs->z_vfs);
   2664 		return (error);
   2665 	}
   2666 #else
   2667 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
   2668 		return (error);
   2669 #endif
   2670 
   2671 	return (0);
   2672 }
   2673 
   2674 /*
   2675  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
   2676  * is an invariant across any of the operations that can be performed while the
   2677  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
   2678  * are the same: the relevant objset and associated dataset are owned by
   2679  * zfsvfs, held, and long held on entry.
   2680  */
   2681 #ifdef __NetBSD__
   2682 static bool
   2683 zfs_resume_selector(void *cl, struct vnode *vp)
   2684 {
   2685 
   2686 	if (zfsctl_is_node(vp))
   2687 		return false;
   2688 	return (VTOZ(vp)->z_sa_hdl == NULL);
   2689 }
   2690 #endif
/*
 * Re-attach zfsvfs to the objset of 'ds' and release the teardown locks.
 *
 * Must be called with 'z_teardown_lock' and 'z_teardown_inactive_lock'
 * write held (asserted below); both are released before returning, on
 * success and on failure.  Returns 0 on success or the error from
 * zfsvfs_init(), in which case a forced unmount is attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	VERIFY0(dmu_objset_from_ds(ds, &os));

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
#ifdef __NetBSD__
	struct vnode_iterator *marker;
	vnode_t *vp;

	/*
	 * vgone() every vnode picked by zfs_resume_selector(), then undo
	 * the vfs_suspend() done in zfs_suspend_fs().
	 */
	vfs_vnode_iterator_init(zfsvfs->z_vfs, &marker);
	while ((vp = vfs_vnode_iterator_next(marker,
	    zfs_resume_selector, NULL))) {
		vgone(vp);
	}
	vfs_vnode_iterator_destroy(marker);
	vfs_resume(zfsvfs->z_vfs);
#endif

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
   2759 
   2760 static void
   2761 zfs_freevfs(vfs_t *vfsp)
   2762 {
   2763 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   2764 
   2765 #ifdef illumos
   2766 	/*
   2767 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
   2768 	 * from zfs_mount().  Release it here.  If we came through
   2769 	 * zfs_mountroot() instead, we didn't grab an extra hold, so
   2770 	 * skip the VFS_RELE for rootvfs.
   2771 	 */
   2772 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
   2773 		VFS_RELE(zfsvfs->z_parent->z_vfs);
   2774 #endif
   2775 
   2776 	zfsvfs_free(zfsvfs);
   2777 
   2778 	atomic_dec_32(&zfs_active_fs_count);
   2779 }
   2780 
   2781 #ifdef __FreeBSD_kernel__
#ifdef __i386__
/* desiredvnodes as it was before zfs_vnodes_adjust() changed it. */
static int desiredvnodes_backup;
#endif

/*
 * On i386, lower desiredvnodes to three quarters of its default, but only
 * when it still has the default value.  A no-op on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int autotuned;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the value the same way vntblinit() does.  If
	 * desiredvnodes still matches it, the administrator has not tuned
	 * it and we are free to tune it down.
	 */
	autotuned = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (desiredvnodes == autotuned)
		desiredvnodes = (3 * autotuned) / 4;
#endif
}
   2806 
/*
 * Restore desiredvnodes to the value saved by zfs_vnodes_adjust().
 * Only does anything on i386.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
   2815 #endif /* __FreeBSD_kernel__ */
   2816 
   2817 #ifdef __NetBSD__
/* NetBSD: no vnode limit tuning is done; kept so zfs_init() can call it. */
static void
zfs_vnodes_adjust(void)
{
	/* Intentionally empty. */
}
   2822 
/* NetBSD: nothing was adjusted, so there is nothing to restore. */
static void
zfs_vnodes_adjust_back(void)
{
	/* Intentionally empty. */
}
   2827 #endif
   2828 
   2829 void
   2830 zfs_init(void)
   2831 {
   2832 
   2833 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
   2834 
   2835 	/*
   2836 	 * Initialize .zfs directory structures
   2837 	 */
   2838 	zfsctl_init();
   2839 
   2840 	/*
   2841 	 * Initialize znode cache, vnode ops, etc...
   2842 	 */
   2843 	zfs_znode_init();
   2844 
   2845 	/*
   2846 	 * Reduce number of vnodes. Originally number of vnodes is calculated
   2847 	 * with UFS inode in mind. We reduce it here, because it's too big for
   2848 	 * ZFS/i386.
   2849 	 */
   2850 	zfs_vnodes_adjust();
   2851 
   2852 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
   2853 }
   2854 
/*
 * Tear down what zfs_init() set up: the .zfs control directory state, the
 * znode subsystem, and finally the vnode limit adjustment.
 */
void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();

	/* Undo zfs_vnodes_adjust(). */
	zfs_vnodes_adjust_back();
}
   2862 
   2863 int
   2864 zfs_busy(void)
   2865 {
   2866 	return (zfs_active_fs_count != 0);
   2867 }
   2868 
   2869 int
   2870 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
   2871 {
   2872 	int error;
   2873 	objset_t *os = zfsvfs->z_os;
   2874 	dmu_tx_t *tx;
   2875 
   2876 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
   2877 		return (SET_ERROR(EINVAL));
   2878 
   2879 	if (newvers < zfsvfs->z_version)
   2880 		return (SET_ERROR(EINVAL));
   2881 
   2882 	if (zfs_spa_version_map(newvers) >
   2883 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
   2884 		return (SET_ERROR(ENOTSUP));
   2885 
   2886 	tx = dmu_tx_create(os);
   2887 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
   2888 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
   2889 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
   2890 		    ZFS_SA_ATTRS);
   2891 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
   2892 	}
   2893 	error = dmu_tx_assign(tx, TXG_WAIT);
   2894 	if (error) {
   2895 		dmu_tx_abort(tx);
   2896 		return (error);
   2897 	}
   2898 
   2899 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
   2900 	    8, 1, &newvers, tx);
   2901 
   2902 	if (error) {
   2903 		dmu_tx_commit(tx);
   2904 		return (error);
   2905 	}
   2906 
   2907 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
   2908 		uint64_t sa_obj;
   2909 
   2910 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
   2911 		    SPA_VERSION_SA);
   2912 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
   2913 		    DMU_OT_NONE, 0, tx);
   2914 
   2915 		error = zap_add(os, MASTER_NODE_OBJ,
   2916 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
   2917 		ASSERT0(error);
   2918 
   2919 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
   2920 		sa_register_update_callback(os, zfs_sa_upgrade);
   2921 	}
   2922 
   2923 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
   2924 	    "from %llu to %llu", zfsvfs->z_version, newvers);
   2925 
   2926 	dmu_tx_commit(tx);
   2927 
   2928 	zfsvfs->z_version = newvers;
   2929 
   2930 	zfs_set_fuid_feature(zfsvfs);
   2931 
   2932 	return (0);
   2933 }
   2934 
   2935 /*
   2936  * Read a property stored within the master node.
   2937  */
   2938 int
   2939 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
   2940 {
   2941 	const char *pname;
   2942 	int error = ENOENT;
   2943 
   2944 	/*
   2945 	 * Look up the file system's value for the property.  For the
   2946 	 * version property, we look up a slightly different string.
   2947 	 */
   2948 	if (prop == ZFS_PROP_VERSION)
   2949 		pname = ZPL_VERSION_STR;
   2950 	else
   2951 		pname = zfs_prop_to_name(prop);
   2952 
   2953 	if (os != NULL)
   2954 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
   2955 
   2956 	if (error == ENOENT) {
   2957 		/* No value set, use the default value */
   2958 		switch (prop) {
   2959 		case ZFS_PROP_VERSION:
   2960 			*value = ZPL_VERSION;
   2961 			break;
   2962 		case ZFS_PROP_NORMALIZE:
   2963 		case ZFS_PROP_UTF8ONLY:
   2964 			*value = 0;
   2965 			break;
   2966 		case ZFS_PROP_CASE:
   2967 			*value = ZFS_CASE_SENSITIVE;
   2968 			break;
   2969 		default:
   2970 			return (error);
   2971 		}
   2972 		error = 0;
   2973 	}
   2974 	return (error);
   2975 }
   2976 
   2977 #if defined(__FreeBSD_kernel__) || defined(__NetBSD__)
   2978 #ifdef _KERNEL
   2979 void
   2980 zfsvfs_update_fromname(const char *oldname, const char *newname)
   2981 {
   2982 	char tmpbuf[MAXPATHLEN];
   2983 	struct mount *mp;
   2984 	char *fromname;
   2985 	size_t oldlen;
   2986 
   2987 	oldlen = strlen(oldname);
   2988 
   2989 #ifdef __NetBSD__
   2990 	mount_iterator_t *iter;
   2991 	mountlist_iterator_init(&iter);
   2992 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
   2993 #else
   2994 	mtx_lock(&mountlist_mtx);
   2995 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
   2996 #endif
   2997 		fromname = mp->mnt_stat.f_mntfromname;
   2998 		if (strcmp(fromname, oldname) == 0) {
   2999 			(void)strlcpy(fromname, newname,
   3000 			    sizeof(mp->mnt_stat.f_mntfromname));
   3001 			continue;
   3002 		}
   3003 		if (strncmp(fromname, oldname, oldlen) == 0 &&
   3004 		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
   3005 			(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
   3006 			    newname, fromname + oldlen);
   3007 			(void)strlcpy(fromname, tmpbuf,
   3008 			    sizeof(mp->mnt_stat.f_mntfromname));
   3009 			continue;
   3010 		}
   3011 	}
   3012 #ifdef __NetBSD__
   3013 	mountlist_iterator_destroy(iter);
   3014 #else
   3015 	mtx_unlock(&mountlist_mtx);
   3016 #endif
   3017 }
   3018 #endif
   3019 #endif
   3020