Home | History | Annotate | Line # | Download | only in union
union_subr.c revision 1.70
      1 /*	$NetBSD: union_subr.c,v 1.70 2015/02/16 10:22:00 hannken Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1994
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Jan-Simon Pendry.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. Neither the name of the University nor the names of its contributors
     19  *    may be used to endorse or promote products derived from this software
     20  *    without specific prior written permission.
     21  *
     22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     32  * SUCH DAMAGE.
     33  *
     34  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
     35  */
     36 
     37 /*
     38  * Copyright (c) 1994 Jan-Simon Pendry
     39  *
     40  * This code is derived from software contributed to Berkeley by
     41  * Jan-Simon Pendry.
     42  *
     43  * Redistribution and use in source and binary forms, with or without
     44  * modification, are permitted provided that the following conditions
     45  * are met:
     46  * 1. Redistributions of source code must retain the above copyright
     47  *    notice, this list of conditions and the following disclaimer.
     48  * 2. Redistributions in binary form must reproduce the above copyright
     49  *    notice, this list of conditions and the following disclaimer in the
     50  *    documentation and/or other materials provided with the distribution.
     51  * 3. All advertising materials mentioning features or use of this software
     52  *    must display the following acknowledgement:
     53  *	This product includes software developed by the University of
     54  *	California, Berkeley and its contributors.
     55  * 4. Neither the name of the University nor the names of its contributors
     56  *    may be used to endorse or promote products derived from this software
     57  *    without specific prior written permission.
     58  *
     59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     69  * SUCH DAMAGE.
     70  *
     71  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
     72  */
     73 
     74 #include <sys/cdefs.h>
     75 __KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.70 2015/02/16 10:22:00 hannken Exp $");
     76 
     77 #include <sys/param.h>
     78 #include <sys/systm.h>
     79 #include <sys/proc.h>
     80 #include <sys/time.h>
     81 #include <sys/kernel.h>
     82 #include <sys/vnode.h>
     83 #include <sys/namei.h>
     84 #include <sys/malloc.h>
     85 #include <sys/dirent.h>
     86 #include <sys/file.h>
     87 #include <sys/filedesc.h>
     88 #include <sys/queue.h>
     89 #include <sys/mount.h>
     90 #include <sys/stat.h>
     91 #include <sys/kauth.h>
     92 
     93 #include <uvm/uvm_extern.h>
     94 
     95 #include <fs/union/union.h>
     96 #include <miscfs/genfs/genfs.h>
     97 #include <miscfs/specfs/specdev.h>
     98 
/* Hash table of union nodes; entries are chained through un_cache. */
static LIST_HEAD(uhashhead, union_node) *uhashtbl;
static u_long uhash_mask;		/* size of hash table - 1 */
/*
 * Bucket index for an (upper, lower) vnode pair: the two pointer values
 * are added, shifted right by 8 bits and masked to the table size.
 */
#define UNION_HASH(u, l) \
	((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask)
/* Marker for an unused slot in the hash[] array of union_allocvp(). */
#define NOHASH	((u_long)-1)

/*
 * Taken around hash table updates, UN_CACHED changes and un_refs
 * changes by the functions in this file.
 */
static kmutex_t uhash_lock;

/* Prototypes for functions defined later in this file. */
void union_updatevp(struct union_node *, struct vnode *, struct vnode *);
static void union_ref(struct union_node *);
static void union_rele(struct union_node *);
static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t,    const char *);
int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
static void union_dircache_r(struct vnode *, struct vnode ***, int *);
struct vnode *union_dircache(struct vnode *, struct lwp *);
    114 
    115 void
    116 union_init(void)
    117 {
    118 
    119 	mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE);
    120 	uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask);
    121 }
    122 
    123 void
    124 union_reinit(void)
    125 {
    126 	struct union_node *un;
    127 	struct uhashhead *oldhash, *hash;
    128 	u_long oldmask, mask, val;
    129 	int i;
    130 
    131 	hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
    132 	mutex_enter(&uhash_lock);
    133 	oldhash = uhashtbl;
    134 	oldmask = uhash_mask;
    135 	uhashtbl = hash;
    136 	uhash_mask = mask;
    137 	for (i = 0; i <= oldmask; i++) {
    138 		while ((un = LIST_FIRST(&oldhash[i])) != NULL) {
    139 			LIST_REMOVE(un, un_cache);
    140 			val = UNION_HASH(un->un_uppervp, un->un_lowervp);
    141 			LIST_INSERT_HEAD(&hash[val], un, un_cache);
    142 		}
    143 	}
    144 	mutex_exit(&uhash_lock);
    145 	hashdone(oldhash, HASH_LIST, oldmask);
    146 }
    147 
    148 /*
    149  * Free global unionfs resources.
    150  */
    151 void
    152 union_done(void)
    153 {
    154 
    155 	hashdone(uhashtbl, HASH_LIST, uhash_mask);
    156 	mutex_destroy(&uhash_lock);
    157 
    158 	/* Make sure to unset the readdir hook. */
    159 	vn_union_readdir_hook = NULL;
    160 }
    161 
/*
 * Replace the upper and/or lower vnode of a union node and keep the
 * node's position in the hash table consistent with the new pair.
 *
 * (un) is the node to update; its union vnode must be exclusively
 * locked (asserted below).
 * (uppervp) and (lowervp) are the new layer vnodes; either may be
 * NULLVP.  The references previously held on replaced layer vnodes
 * are dropped with vrele().
 *
 * The whole update runs with uhash_lock held.
 */
void
union_updatevp(struct union_node *un, struct vnode *uppervp,
	struct vnode *lowervp)
{
	/* Hash buckets for the current and for the new vnode pair. */
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	/* A node with neither layer is never kept in the hash table. */
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	bool un_unlock;

	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);

	mutex_enter(&uhash_lock);

	/* Unhash the node if it will move buckets or must not be cached. */
	if (!docache || ohash != nhash) {
		if (un->un_cflags & UN_CACHED) {
			un->un_cflags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			/*
			 * The saved name and directory belong to the old
			 * lower vnode; discard them together with it.
			 */
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
			if (un->un_dirvp) {
				vrele(un->un_dirvp);
				un->un_dirvp = NULLVP;
			}
		}
		un->un_lowervp = lowervp;
		/* Forget the cached lower size; it is updated under un_lock. */
		mutex_enter(&un->un_lock);
		un->un_lowersz = VNOVAL;
		mutex_exit(&un->un_lock);
	}

	if (un->un_uppervp != uppervp) {
		/*
		 * Remember whether the node had no upper vnode before; in
		 * that case the union vnode is unlocked via genfs_unlock()
		 * once the new upper vnode has been installed.
		 * NOTE(review): presumably the lock is then the upper
		 * vnode's lock -- confirm against the union lock vnode ops.
		 */
		if (un->un_uppervp) {
			un_unlock = false;
			vrele(un->un_uppervp);
		} else
			un_unlock = true;

		mutex_enter(&un->un_lock);
		un->un_uppervp = uppervp;
		mutex_exit(&un->un_lock);
		if (un_unlock) {
			struct vop_unlock_args ap;

			ap.a_vp = UNIONTOV(un);
			genfs_unlock(&ap);
		}
		/* Forget the cached upper size as well. */
		mutex_enter(&un->un_lock);
		un->un_uppersz = VNOVAL;
		mutex_exit(&un->un_lock);
		/* Update union vnode interlock. */
		if (uppervp != NULL) {
			mutex_obj_hold(uppervp->v_interlock);
			uvm_obj_setlock(&UNIONTOV(un)->v_uobj,
			    uppervp->v_interlock);
		}
	}

	/* Re-enter the node in its new bucket if the bucket changed. */
	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
		un->un_cflags |= UN_CACHED;
	}

	mutex_exit(&uhash_lock);
}
    234 
    235 void
    236 union_newlower(struct union_node *un, struct vnode *lowervp)
    237 {
    238 
    239 	union_updatevp(un, un->un_uppervp, lowervp);
    240 }
    241 
    242 void
    243 union_newupper(struct union_node *un, struct vnode *uppervp)
    244 {
    245 
    246 	union_updatevp(un, uppervp, un->un_lowervp);
    247 }
    248 
    249 /*
    250  * Keep track of size changes in the underlying vnodes.
    251  * If the size changes, then callback to the vm layer
    252  * giving priority to the upper layer size.
    253  *
    254  * Mutex un_lock hold on entry and released on return.
    255  */
    256 void
    257 union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
    258 {
    259 	struct union_node *un = VTOUNION(vp);
    260 	off_t sz;
    261 
    262 	KASSERT(mutex_owned(&un->un_lock));
    263 	/* only interested in regular files */
    264 	if (vp->v_type != VREG) {
    265 		mutex_exit(&un->un_lock);
    266 		uvm_vnp_setsize(vp, 0);
    267 		return;
    268 	}
    269 
    270 	sz = VNOVAL;
    271 
    272 	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
    273 		un->un_uppersz = uppersz;
    274 		if (sz == VNOVAL)
    275 			sz = un->un_uppersz;
    276 	}
    277 
    278 	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
    279 		un->un_lowersz = lowersz;
    280 		if (sz == VNOVAL)
    281 			sz = un->un_lowersz;
    282 	}
    283 	mutex_exit(&un->un_lock);
    284 
    285 	if (sz != VNOVAL) {
    286 #ifdef UNION_DIAGNOSTIC
    287 		printf("union: %s size now %qd\n",
    288 		    uppersz != VNOVAL ? "upper" : "lower", sz);
    289 #endif
    290 		uvm_vnp_setsize(vp, sz);
    291 	}
    292 }
    293 
    294 static void
    295 union_ref(struct union_node *un)
    296 {
    297 
    298 	KASSERT(mutex_owned(&uhash_lock));
    299 	un->un_refs++;
    300 }
    301 
    302 static void
    303 union_rele(struct union_node *un)
    304 {
    305 
    306 	mutex_enter(&uhash_lock);
    307 	un->un_refs--;
    308 	if (un->un_refs > 0) {
    309 		mutex_exit(&uhash_lock);
    310 		return;
    311 	}
    312 	if (un->un_cflags & UN_CACHED) {
    313 		un->un_cflags &= ~UN_CACHED;
    314 		LIST_REMOVE(un, un_cache);
    315 	}
    316 	mutex_exit(&uhash_lock);
    317 
    318 	if (un->un_pvp != NULLVP)
    319 		vrele(un->un_pvp);
    320 	if (un->un_uppervp != NULLVP)
    321 		vrele(un->un_uppervp);
    322 	if (un->un_lowervp != NULLVP)
    323 		vrele(un->un_lowervp);
    324 	if (un->un_dirvp != NULLVP)
    325 		vrele(un->un_dirvp);
    326 	if (un->un_path)
    327 		free(un->un_path, M_TEMP);
    328 	mutex_destroy(&un->un_lock);
    329 
    330 	free(un, M_TEMP);
    331 }
    332 
    333 /*
    334  * allocate a union_node/vnode pair.  the vnode is
    335  * referenced and unlocked.  the new vnode is returned
    336  * via (vpp).  (mp) is the mountpoint of the union filesystem,
    337  * (dvp) is the parent directory where the upper layer object
    338  * should exist (but doesn't) and (cnp) is the componentname
    339  * information which is partially copied to allow the upper
    340  * layer object to be created at a later time.  (uppervp)
    341  * and (lowervp) reference the upper and lower layer objects
    342  * being mapped.  either, but not both, can be nil.
    343  * both, if supplied, are unlocked.
    344  * the reference is either maintained in the new union_node
    345  * object which is allocated, or they are vrele'd.
    346  *
    347  * all union_nodes are maintained on a hash
    348  * list.  new nodes are only allocated when they cannot
    349  * be found on this list.  entries on the list are
    350  * removed when the vfs reclaim entry is called.
    351  *
    352  * the vnode gets attached or referenced with vcache_get().
    353  */
    354 int
    355 union_allocvp(
    356 	struct vnode **vpp,
    357 	struct mount *mp,
    358 	struct vnode *undvp,		/* parent union vnode */
    359 	struct vnode *dvp,		/* may be null */
    360 	struct componentname *cnp,	/* may be null */
    361 	struct vnode *uppervp,		/* may be null */
    362 	struct vnode *lowervp,		/* may be null */
    363 	int docache)
    364 {
    365 	int error;
    366 	struct union_node *un = NULL, *un1;
    367 	struct vnode *vp, *xlowervp = NULLVP;
    368 	u_long hash[3];
    369 	int try;
    370 	bool is_dotdot;
    371 
    372 	is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT));
    373 
    374 	if (uppervp == NULLVP && lowervp == NULLVP)
    375 		panic("union: unidentifiable allocation");
    376 
    377 	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
    378 		xlowervp = lowervp;
    379 		lowervp = NULLVP;
    380 	}
    381 
    382 	if (!docache) {
    383 		un = NULL;
    384 		goto found;
    385 	}
    386 
    387 	/*
    388 	 * If both uppervp and lowervp are not NULL we have to
    389 	 * search union nodes with one vnode as NULL too.
    390 	 */
    391 	hash[0] = UNION_HASH(uppervp, lowervp);
    392 	if (uppervp == NULL || lowervp == NULL) {
    393 		hash[1] = hash[2] = NOHASH;
    394 	} else {
    395 		hash[1] = UNION_HASH(uppervp, NULLVP);
    396 		hash[2] = UNION_HASH(NULLVP, lowervp);
    397 	}
    398 
    399 loop:
    400 	mutex_enter(&uhash_lock);
    401 
    402 	for (try = 0; try < 3; try++) {
    403 		if (hash[try] == NOHASH)
    404 			continue;
    405 		LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) {
    406 			if ((un->un_lowervp && un->un_lowervp != lowervp) ||
    407 			    (un->un_uppervp && un->un_uppervp != uppervp) ||
    408 			    un->un_mount != mp)
    409 				continue;
    410 
    411 			union_ref(un);
    412 			mutex_exit(&uhash_lock);
    413 			error = vcache_get(mp, &un, sizeof(un), &vp);
    414 			KASSERT(error != 0 || UNIONTOV(un) == vp);
    415 			union_rele(un);
    416 			if (error == ENOENT)
    417 				goto loop;
    418 			else if (error)
    419 				goto out;
    420 			goto found;
    421 		}
    422 	}
    423 
    424 	mutex_exit(&uhash_lock);
    425 
    426 found:
    427 	if (un) {
    428 		if (uppervp != dvp) {
    429 			if (is_dotdot)
    430 				VOP_UNLOCK(dvp);
    431 			vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY);
    432 			if (is_dotdot)
    433 				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
    434 		}
    435 		/*
    436 		 * Save information about the upper layer.
    437 		 */
    438 		if (uppervp != un->un_uppervp) {
    439 			union_newupper(un, uppervp);
    440 		} else if (uppervp) {
    441 			vrele(uppervp);
    442 		}
    443 
    444 		/*
    445 		 * Save information about the lower layer.
    446 		 * This needs to keep track of pathname
    447 		 * and directory information which union_vn_create
    448 		 * might need.
    449 		 */
    450 		if (lowervp != un->un_lowervp) {
    451 			union_newlower(un, lowervp);
    452 			if (cnp && (lowervp != NULLVP)) {
    453 				un->un_path = malloc(cnp->cn_namelen+1,
    454 						M_TEMP, M_WAITOK);
    455 				memcpy(un->un_path, cnp->cn_nameptr,
    456 						cnp->cn_namelen);
    457 				un->un_path[cnp->cn_namelen] = '\0';
    458 				vref(dvp);
    459 				un->un_dirvp = dvp;
    460 			}
    461 		} else if (lowervp) {
    462 			vrele(lowervp);
    463 		}
    464 		*vpp = UNIONTOV(un);
    465 		if (uppervp != dvp)
    466 			VOP_UNLOCK(*vpp);
    467 		error = 0;
    468 		goto out;
    469 	}
    470 
    471 	un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);
    472 	mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE);
    473 	un->un_refs = 1;
    474 	un->un_mount = mp;
    475 	un->un_vnode = NULL;
    476 	un->un_uppervp = uppervp;
    477 	un->un_lowervp = lowervp;
    478 	un->un_pvp = undvp;
    479 	if (undvp != NULLVP)
    480 		vref(undvp);
    481 	un->un_dircache = 0;
    482 	un->un_openl = 0;
    483 	un->un_cflags = 0;
    484 
    485 	un->un_uppersz = VNOVAL;
    486 	un->un_lowersz = VNOVAL;
    487 
    488 	if (dvp && cnp && (lowervp != NULLVP)) {
    489 		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
    490 		memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
    491 		un->un_path[cnp->cn_namelen] = '\0';
    492 		vref(dvp);
    493 		un->un_dirvp = dvp;
    494 	} else {
    495 		un->un_path = 0;
    496 		un->un_dirvp = 0;
    497 	}
    498 
    499 	if (docache) {
    500 		mutex_enter(&uhash_lock);
    501 		LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) {
    502 			if (un1->un_lowervp == lowervp &&
    503 			    un1->un_uppervp == uppervp &&
    504 			    un1->un_mount == mp) {
    505 				/*
    506 				 * Another thread beat us, push back freshly
    507 				 * allocated node and retry.
    508 				 */
    509 				mutex_exit(&uhash_lock);
    510 				union_rele(un);
    511 				goto loop;
    512 			}
    513 		}
    514 		LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache);
    515 		un->un_cflags |= UN_CACHED;
    516 		mutex_exit(&uhash_lock);
    517 	}
    518 
    519 	error = vcache_get(mp, &un, sizeof(un), vpp);
    520 	KASSERT(error != 0 || UNIONTOV(un) == *vpp);
    521 	union_rele(un);
    522 	if (error == ENOENT)
    523 		goto loop;
    524 
    525 out:
    526 	if (xlowervp)
    527 		vrele(xlowervp);
    528 
    529 	return error;
    530 }
    531 
    532 int
    533 union_freevp(struct vnode *vp)
    534 {
    535 	struct union_node *un = VTOUNION(vp);
    536 
    537 	vcache_remove(vp->v_mount, &un, sizeof(un));
    538 
    539 	mutex_enter(vp->v_interlock);
    540 	vp->v_data = NULL;
    541 	mutex_exit(vp->v_interlock);
    542 
    543 	union_rele(un);
    544 
    545 	return 0;
    546 }
    547 
    548 int
    549 union_loadvnode(struct mount *mp, struct vnode *vp,
    550     const void *key, size_t key_len, const void **new_key)
    551 {
    552 	struct vattr va;
    553 	struct vnode *svp;
    554 	struct union_node *un;
    555 	struct union_mount *um;
    556 	voff_t uppersz, lowersz;
    557 
    558 	KASSERT(key_len == sizeof(un));
    559 	memcpy(&un, key, key_len);
    560 
    561 	um = MOUNTTOUNIONMOUNT(mp);
    562 	svp = (un->un_uppervp != NULLVP) ? un->un_uppervp : un->un_lowervp;
    563 
    564 	vp->v_tag = VT_UNION;
    565 	vp->v_op = union_vnodeop_p;
    566 	vp->v_data = un;
    567 	un->un_vnode = vp;
    568 
    569 	vp->v_type = svp->v_type;
    570 	if (svp->v_type == VCHR || svp->v_type == VBLK)
    571 		spec_node_init(vp, svp->v_rdev);
    572 
    573 	mutex_obj_hold(svp->v_interlock);
    574 	uvm_obj_setlock(&vp->v_uobj, svp->v_interlock);
    575 	vp->v_iflag |= VI_LOCKSHARE;
    576 
    577 	/* detect the root vnode (and aliases) */
    578 	if ((un->un_uppervp == um->um_uppervp) &&
    579 	    ((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) {
    580 		if (un->un_lowervp == NULLVP) {
    581 			un->un_lowervp = um->um_lowervp;
    582 			if (un->un_lowervp != NULLVP)
    583 				vref(un->un_lowervp);
    584 		}
    585 		vp->v_vflag |= VV_ROOT;
    586 	} else {
    587 		vp->v_iflag |= VI_LAYER;
    588 	}
    589 
    590 	uppersz = lowersz = VNOVAL;
    591 	if (un->un_uppervp != NULLVP) {
    592 		if (vn_lock(un->un_uppervp, LK_SHARED) == 0) {
    593 			if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0)
    594 				uppersz = va.va_size;
    595 			VOP_UNLOCK(un->un_uppervp);
    596 		}
    597 	}
    598 	if (un->un_lowervp != NULLVP) {
    599 		if (vn_lock(un->un_lowervp, LK_SHARED) == 0) {
    600 			if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0)
    601 				lowersz = va.va_size;
    602 			VOP_UNLOCK(un->un_lowervp);
    603 		}
    604 	}
    605 
    606 	mutex_enter(&un->un_lock);
    607 	union_newsize(vp, uppersz, lowersz);
    608 
    609 	mutex_enter(&uhash_lock);
    610 	union_ref(un);
    611 	mutex_exit(&uhash_lock);
    612 
    613 	*new_key = &vp->v_data;
    614 
    615 	return 0;
    616 }
    617 
    618 /*
    619  * copyfile.  copy the vnode (fvp) to the vnode (tvp)
    620  * using a sequence of reads and writes.  both (fvp)
    621  * and (tvp) are locked on entry and exit.
    622  */
    623 int
    624 union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
    625 	struct lwp *l)
    626 {
    627 	char *tbuf;
    628 	struct uio uio;
    629 	struct iovec iov;
    630 	int error = 0;
    631 
    632 	/*
    633 	 * strategy:
    634 	 * allocate a buffer of size MAXBSIZE.
    635 	 * loop doing reads and writes, keeping track
    636 	 * of the current uio offset.
    637 	 * give up at the first sign of trouble.
    638 	 */
    639 
    640 	uio.uio_offset = 0;
    641 	UIO_SETUP_SYSSPACE(&uio);
    642 
    643 	tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
    644 
    645 	/* ugly loop follows... */
    646 	do {
    647 		off_t offset = uio.uio_offset;
    648 
    649 		uio.uio_iov = &iov;
    650 		uio.uio_iovcnt = 1;
    651 		iov.iov_base = tbuf;
    652 		iov.iov_len = MAXBSIZE;
    653 		uio.uio_resid = iov.iov_len;
    654 		uio.uio_rw = UIO_READ;
    655 		error = VOP_READ(fvp, &uio, 0, cred);
    656 
    657 		if (error == 0) {
    658 			uio.uio_iov = &iov;
    659 			uio.uio_iovcnt = 1;
    660 			iov.iov_base = tbuf;
    661 			iov.iov_len = MAXBSIZE - uio.uio_resid;
    662 			uio.uio_offset = offset;
    663 			uio.uio_rw = UIO_WRITE;
    664 			uio.uio_resid = iov.iov_len;
    665 
    666 			if (uio.uio_resid == 0)
    667 				break;
    668 
    669 			do {
    670 				error = VOP_WRITE(tvp, &uio, 0, cred);
    671 			} while ((uio.uio_resid > 0) && (error == 0));
    672 		}
    673 
    674 	} while (error == 0);
    675 
    676 	free(tbuf, M_TEMP);
    677 	return (error);
    678 }
    679 
    680 /*
    681  * (un) is assumed to be locked on entry and remains
    682  * locked on exit.
    683  */
    684 int
    685 union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
    686 	struct lwp *l)
    687 {
    688 	int error;
    689 	struct vnode *lvp, *uvp;
    690 	struct vattr lvattr, uvattr;
    691 
    692 	error = union_vn_create(&uvp, un, l);
    693 	if (error)
    694 		return (error);
    695 
    696 	KASSERT(VOP_ISLOCKED(uvp) == LK_EXCLUSIVE);
    697 	union_newupper(un, uvp);
    698 
    699 	lvp = un->un_lowervp;
    700 
    701 	if (docopy) {
    702 		/*
    703 		 * XX - should not ignore errors
    704 		 * from VOP_CLOSE
    705 		 */
    706 		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
    707 
    708         	error = VOP_GETATTR(lvp, &lvattr, cred);
    709 		if (error == 0)
    710 			error = VOP_OPEN(lvp, FREAD, cred);
    711 		if (error == 0) {
    712 			error = union_copyfile(lvp, uvp, cred, l);
    713 			(void) VOP_CLOSE(lvp, FREAD, cred);
    714 		}
    715 		if (error == 0) {
    716 			/* Copy permissions up too */
    717 			vattr_null(&uvattr);
    718 			uvattr.va_mode = lvattr.va_mode;
    719 			uvattr.va_flags = lvattr.va_flags;
    720         		error = VOP_SETATTR(uvp, &uvattr, cred);
    721 		}
    722 		VOP_UNLOCK(lvp);
    723 #ifdef UNION_DIAGNOSTIC
    724 		if (error == 0)
    725 			uprintf("union: copied up %s\n", un->un_path);
    726 #endif
    727 
    728 	}
    729 	union_vn_close(uvp, FWRITE, cred, l);
    730 
    731 	/*
    732 	 * Subsequent IOs will go to the top layer, so
    733 	 * call close on the lower vnode and open on the
    734 	 * upper vnode to ensure that the filesystem keeps
    735 	 * its references counts right.  This doesn't do
    736 	 * the right thing with (cred) and (FREAD) though.
    737 	 * Ignoring error returns is not right, either.
    738 	 */
    739 	if (error == 0) {
    740 		int i;
    741 
    742 		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
    743 		for (i = 0; i < un->un_openl; i++) {
    744 			(void) VOP_CLOSE(lvp, FREAD, cred);
    745 			(void) VOP_OPEN(uvp, FREAD, cred);
    746 		}
    747 		un->un_openl = 0;
    748 		VOP_UNLOCK(lvp);
    749 	}
    750 
    751 	return (error);
    752 
    753 }
    754 
    755 /*
    756  * Prepare the creation of a new node in the upper layer.
    757  *
    758  * (dvp) is the directory in which to create the new node.
    759  * it is locked on entry and exit.
    760  * (cnp) is the componentname to be created.
    761  * (cred, path, hash) are credentials, path and its hash to fill (cnp).
    762  */
    763 static int
    764 union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
    765     const char *path)
    766 {
    767 	int error;
    768 	struct vnode *vp;
    769 
    770 	cnp->cn_nameiop = CREATE;
    771 	cnp->cn_flags = LOCKPARENT | ISLASTCN;
    772 	cnp->cn_cred = cred;
    773 	cnp->cn_nameptr = path;
    774 	cnp->cn_namelen = strlen(path);
    775 
    776 	error = VOP_LOOKUP(dvp, &vp, cnp);
    777 
    778 	if (error == 0) {
    779 		KASSERT(vp != NULL);
    780 		VOP_ABORTOP(dvp, cnp);
    781 		vrele(vp);
    782 		error = EEXIST;
    783 	} else if (error == EJUSTRETURN) {
    784 		error = 0;
    785 	}
    786 
    787 	return error;
    788 }
    789 
    790 /*
    791  * Create a shadow directory in the upper layer.
    792  * The new vnode is returned locked.
    793  *
    794  * (um) points to the union mount structure for access to the
    795  * the mounting process's credentials.
    796  * (dvp) is the directory in which to create the shadow directory.
    797  * it is unlocked on entry and exit.
    798  * (cnp) is the componentname to be created.
    799  * (vpp) is the returned newly created shadow directory, which
    800  * is returned locked.
    801  *
    802  * N.B. We still attempt to create shadow directories even if the union
    803  * is mounted read-only, which is a little nonintuitive.
    804  */
    805 int
    806 union_mkshadow(struct union_mount *um, struct vnode *dvp,
    807 	struct componentname *cnp, struct vnode **vpp)
    808 {
    809 	int error;
    810 	struct vattr va;
    811 	struct componentname cn;
    812 	char *pnbuf;
    813 
    814 	if (cnp->cn_namelen + 1 > MAXPATHLEN)
    815 		return ENAMETOOLONG;
    816 	pnbuf = PNBUF_GET();
    817 	memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
    818 	pnbuf[cnp->cn_namelen] = '\0';
    819 
    820 	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
    821 
    822 	error = union_do_lookup(dvp, &cn,
    823 	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf);
    824 	if (error) {
    825 		VOP_UNLOCK(dvp);
    826 		PNBUF_PUT(pnbuf);
    827 		return error;
    828 	}
    829 
    830 	/*
    831 	 * policy: when creating the shadow directory in the
    832 	 * upper layer, create it owned by the user who did
    833 	 * the mount, group from parent directory, and mode
    834 	 * 777 modified by umask (ie mostly identical to the
    835 	 * mkdir syscall).  (jsp, kb)
    836 	 */
    837 
    838 	vattr_null(&va);
    839 	va.va_type = VDIR;
    840 	va.va_mode = um->um_cmode;
    841 
    842 	KASSERT(*vpp == NULL);
    843 	error = VOP_MKDIR(dvp, vpp, &cn, &va);
    844 	VOP_UNLOCK(dvp);
    845 	PNBUF_PUT(pnbuf);
    846 	return error;
    847 }
    848 
    849 /*
    850  * Create a whiteout entry in the upper layer.
    851  *
    852  * (um) points to the union mount structure for access to the
    853  * the mounting process's credentials.
    854  * (dvp) is the directory in which to create the whiteout.
    855  * it is locked on entry and exit.
    856  * (cnp) is the componentname to be created.
    857  * (un) holds the path and its hash to be created.
    858  */
    859 int
    860 union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
    861 	struct componentname *cnp, struct union_node *un)
    862 {
    863 	int error;
    864 	struct componentname cn;
    865 
    866 	error = union_do_lookup(dvp, &cn,
    867 	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
    868 	    un->un_path);
    869 	if (error)
    870 		return error;
    871 
    872 	error = VOP_WHITEOUT(dvp, &cn, CREATE);
    873 	return error;
    874 }
    875 
    876 /*
    877  * union_vn_create: creates and opens a new shadow file
    878  * on the upper union layer.  this function is similar
    879  * in spirit to calling vn_open but it avoids calling namei().
    880  * the problem with calling namei is that a) it locks too many
    881  * things, and b) it doesn't start at the "right" directory,
    882  * whereas union_do_lookup is told where to start.
    883  */
    884 int
    885 union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
    886 {
    887 	struct vnode *vp;
    888 	kauth_cred_t cred = l->l_cred;
    889 	struct vattr vat;
    890 	struct vattr *vap = &vat;
    891 	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
    892 	int error;
    893 	int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
    894 	struct componentname cn;
    895 
    896 	*vpp = NULLVP;
    897 
    898 	vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);
    899 
    900 	error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
    901 	    un->un_path);
    902 	if (error) {
    903 		VOP_UNLOCK(un->un_dirvp);
    904 		return error;
    905 	}
    906 
    907 	/*
    908 	 * Good - there was no race to create the file
    909 	 * so go ahead and create it.  The permissions
    910 	 * on the file will be 0666 modified by the
    911 	 * current user's umask.  Access to the file, while
    912 	 * it is unioned, will require access to the top *and*
    913 	 * bottom files.  Access when not unioned will simply
    914 	 * require access to the top-level file.
    915 	 * TODO: confirm choice of access permissions.
    916 	 */
    917 	vattr_null(vap);
    918 	vap->va_type = VREG;
    919 	vap->va_mode = cmode;
    920 	vp = NULL;
    921 	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
    922 	if (error) {
    923 		VOP_UNLOCK(un->un_dirvp);
    924 		return error;
    925 	}
    926 
    927 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    928 	VOP_UNLOCK(un->un_dirvp);
    929 	error = VOP_OPEN(vp, fmode, cred);
    930 	if (error) {
    931 		vput(vp);
    932 		return error;
    933 	}
    934 
    935 	vp->v_writecount++;
    936 	*vpp = vp;
    937 	return 0;
    938 }
    939 
    940 int
    941 union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
    942 {
    943 
    944 	if (fmode & FWRITE)
    945 		--vp->v_writecount;
    946 	return (VOP_CLOSE(vp, fmode, cred));
    947 }
    948 
    949 void
    950 union_removed_upper(struct union_node *un)
    951 {
    952 	struct vnode *vp = UNIONTOV(un);
    953 
    954 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    955 #if 1
    956 	/*
    957 	 * We do not set the uppervp to NULLVP here, because lowervp
    958 	 * may also be NULLVP, so this routine would end up creating
    959 	 * a bogus union node with no upper or lower VP (that causes
    960 	 * pain in many places that assume at least one VP exists).
    961 	 * Since we've removed this node from the cache hash chains,
    962 	 * it won't be found again.  When all current holders
    963 	 * release it, union_inactive() will vgone() it.
    964 	 */
    965 	union_diruncache(un);
    966 #else
    967 	union_newupper(un, NULLVP);
    968 #endif
    969 
    970 	VOP_UNLOCK(vp);
    971 
    972 	mutex_enter(&uhash_lock);
    973 	if (un->un_cflags & UN_CACHED) {
    974 		un->un_cflags &= ~UN_CACHED;
    975 		LIST_REMOVE(un, un_cache);
    976 	}
    977 	mutex_exit(&uhash_lock);
    978 }
    979 
#if 0
/*
 * Currently compiled out.
 *
 * Return the lower vnode of union vnode vp, provided one exists, it
 * has the same v_type as vp, and vget() on it succeeds.  In every
 * other case NULLVP is returned.
 */
struct vnode *
union_lowervp(struct vnode *vp)
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_lowervp == NULLVP)
		return NULLVP;
	if (vp->v_type != un->un_lowervp->v_type)
		return NULLVP;
	if (vget(un->un_lowervp, 0) != 0)
		return NULLVP;

	return un->un_lowervp;
}
#endif
    995 
    996 /*
    997  * determine whether a whiteout is needed
    998  * during a remove/rmdir operation.
    999  */
   1000 int
   1001 union_dowhiteout(struct union_node *un, kauth_cred_t cred)
   1002 {
   1003 	struct vattr va;
   1004 
   1005 	if (un->un_lowervp != NULLVP)
   1006 		return (1);
   1007 
   1008 	if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
   1009 	    (va.va_flags & OPAQUE))
   1010 		return (1);
   1011 
   1012 	return (0);
   1013 }
   1014 
   1015 static void
   1016 union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
   1017 {
   1018 	struct union_node *un;
   1019 
   1020 	if (vp->v_op != union_vnodeop_p) {
   1021 		if (vppp) {
   1022 			vref(vp);
   1023 			*(*vppp)++ = vp;
   1024 			if (--(*cntp) == 0)
   1025 				panic("union: dircache table too small");
   1026 		} else {
   1027 			(*cntp)++;
   1028 		}
   1029 
   1030 		return;
   1031 	}
   1032 
   1033 	un = VTOUNION(vp);
   1034 	if (un->un_uppervp != NULLVP)
   1035 		union_dircache_r(un->un_uppervp, vppp, cntp);
   1036 	if (un->un_lowervp != NULLVP)
   1037 		union_dircache_r(un->un_lowervp, vppp, cntp);
   1038 }
   1039 
   1040 struct vnode *
   1041 union_dircache(struct vnode *vp, struct lwp *l)
   1042 {
   1043 	int cnt;
   1044 	struct vnode *nvp = NULLVP;
   1045 	struct vnode **vpp;
   1046 	struct vnode **dircache;
   1047 	int error;
   1048 
   1049 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   1050 	dircache = VTOUNION(vp)->un_dircache;
   1051 
   1052 	nvp = NULLVP;
   1053 
   1054 	if (dircache == 0) {
   1055 		cnt = 0;
   1056 		union_dircache_r(vp, 0, &cnt);
   1057 		cnt++;
   1058 		dircache = (struct vnode **)
   1059 				malloc(cnt * sizeof(struct vnode *),
   1060 					M_TEMP, M_WAITOK);
   1061 		vpp = dircache;
   1062 		union_dircache_r(vp, &vpp, &cnt);
   1063 		VTOUNION(vp)->un_dircache = dircache;
   1064 		*vpp = NULLVP;
   1065 		vpp = dircache + 1;
   1066 	} else {
   1067 		vpp = dircache;
   1068 		do {
   1069 			if (*vpp++ == VTOUNION(vp)->un_uppervp)
   1070 				break;
   1071 		} while (*vpp != NULLVP);
   1072 	}
   1073 
   1074 	if (*vpp == NULLVP)
   1075 		goto out;
   1076 
   1077 	vref(*vpp);
   1078 	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, *vpp, NULLVP, 0);
   1079 	if (!error) {
   1080 		vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY);
   1081 		VTOUNION(vp)->un_dircache = 0;
   1082 		VTOUNION(nvp)->un_dircache = dircache;
   1083 	}
   1084 
   1085 out:
   1086 	VOP_UNLOCK(vp);
   1087 	return (nvp);
   1088 }
   1089 
   1090 void
   1091 union_diruncache(struct union_node *un)
   1092 {
   1093 	struct vnode **vpp;
   1094 
   1095 	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
   1096 	if (un->un_dircache != 0) {
   1097 		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
   1098 			vrele(*vpp);
   1099 		free(un->un_dircache, M_TEMP);
   1100 		un->un_dircache = 0;
   1101 	}
   1102 }
   1103 
   1104 /*
   1105  * Check whether node can rmdir (check empty).
   1106  */
   1107 int
   1108 union_check_rmdir(struct union_node *un, kauth_cred_t cred)
   1109 {
   1110 	int dirlen, eofflag, error;
   1111 	char *dirbuf;
   1112 	struct vattr va;
   1113 	struct vnode *tvp;
   1114 	struct dirent *dp, *edp;
   1115 	struct componentname cn;
   1116 	struct iovec aiov;
   1117 	struct uio auio;
   1118 
   1119 	KASSERT(un->un_uppervp != NULL);
   1120 
   1121 	/* Check upper for being opaque. */
   1122 	KASSERT(VOP_ISLOCKED(un->un_uppervp));
   1123 	error = VOP_GETATTR(un->un_uppervp, &va, cred);
   1124 	if (error || (va.va_flags & OPAQUE))
   1125 		return error;
   1126 
   1127 	if (un->un_lowervp == NULL)
   1128 		return 0;
   1129 
   1130 	/* Check lower for being empty. */
   1131 	vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
   1132 	error = VOP_GETATTR(un->un_lowervp, &va, cred);
   1133 	if (error) {
   1134 		VOP_UNLOCK(un->un_lowervp);
   1135 		return error;
   1136 	}
   1137 	dirlen = va.va_blocksize;
   1138 	dirbuf = kmem_alloc(dirlen, KM_SLEEP);
   1139 	if (dirbuf == NULL) {
   1140 		VOP_UNLOCK(un->un_lowervp);
   1141 		return ENOMEM;
   1142 	}
   1143 	/* error = 0; */
   1144 	eofflag = 0;
   1145 	auio.uio_offset = 0;
   1146 	do {
   1147 		aiov.iov_len = dirlen;
   1148 		aiov.iov_base = dirbuf;
   1149 		auio.uio_iov = &aiov;
   1150 		auio.uio_iovcnt = 1;
   1151 		auio.uio_resid = aiov.iov_len;
   1152 		auio.uio_rw = UIO_READ;
   1153 		UIO_SETUP_SYSSPACE(&auio);
   1154 		error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
   1155 		    NULL, NULL);
   1156 		if (error)
   1157 			break;
   1158 		edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
   1159 		for (dp = (struct dirent *)dirbuf;
   1160 		    error == 0 && dp < edp;
   1161 		    dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
   1162 			if (dp->d_reclen == 0) {
   1163 				error = ENOTEMPTY;
   1164 				break;
   1165 			}
   1166 			if (dp->d_type == DT_WHT ||
   1167 			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
   1168 			    (dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
   1169 				continue;
   1170 			/* Check for presence in the upper layer. */
   1171 			cn.cn_nameiop = LOOKUP;
   1172 			cn.cn_flags = ISLASTCN | RDONLY;
   1173 			cn.cn_cred = cred;
   1174 			cn.cn_nameptr = dp->d_name;
   1175 			cn.cn_namelen = dp->d_namlen;
   1176 			error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
   1177 			if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
   1178 				error = 0;
   1179 				continue;
   1180 			}
   1181 			if (error == 0)
   1182 				vrele(tvp);
   1183 			error = ENOTEMPTY;
   1184 		}
   1185 	} while (error == 0 && !eofflag);
   1186 	kmem_free(dirbuf, dirlen);
   1187 	VOP_UNLOCK(un->un_lowervp);
   1188 
   1189 	return error;
   1190 }
   1191 
   1192 /*
   1193  * This hook is called from vn_readdir() to switch to lower directory
   1194  * entry after the upper directory is read.
   1195  */
   1196 int
   1197 union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
   1198 {
   1199 	struct vnode *vp = *vpp, *lvp;
   1200 	struct vattr va;
   1201 	int error;
   1202 
   1203 	if (vp->v_op != union_vnodeop_p)
   1204 		return (0);
   1205 
   1206 	/*
   1207 	 * If the directory is opaque,
   1208 	 * then don't show lower entries
   1209 	 */
   1210 	vn_lock(vp, LK_SHARED | LK_RETRY);
   1211 	error = VOP_GETATTR(vp, &va, fp->f_cred);
   1212 	VOP_UNLOCK(vp);
   1213 	if (error || (va.va_flags & OPAQUE))
   1214 		return error;
   1215 
   1216 	if ((lvp = union_dircache(vp, l)) == NULLVP)
   1217 		return (0);
   1218 
   1219 	error = VOP_OPEN(lvp, FREAD, fp->f_cred);
   1220 	if (error) {
   1221 		vput(lvp);
   1222 		return (error);
   1223 	}
   1224 	VOP_UNLOCK(lvp);
   1225 	fp->f_vnode = lvp;
   1226 	fp->f_offset = 0;
   1227 	error = vn_close(vp, FREAD, fp->f_cred);
   1228 	if (error)
   1229 		return (error);
   1230 	*vpp = lvp;
   1231 	return (0);
   1232 }
   1233