Home | History | Annotate | Line # | Download | only in union
union_subr.c revision 1.73
      1 /*	$NetBSD: union_subr.c,v 1.73 2015/04/20 19:36:55 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1994
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Jan-Simon Pendry.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. Neither the name of the University nor the names of its contributors
     19  *    may be used to endorse or promote products derived from this software
     20  *    without specific prior written permission.
     21  *
     22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     32  * SUCH DAMAGE.
     33  *
     34  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
     35  */
     36 
     37 /*
     38  * Copyright (c) 1994 Jan-Simon Pendry
     39  *
     40  * This code is derived from software contributed to Berkeley by
     41  * Jan-Simon Pendry.
     42  *
     43  * Redistribution and use in source and binary forms, with or without
     44  * modification, are permitted provided that the following conditions
     45  * are met:
     46  * 1. Redistributions of source code must retain the above copyright
     47  *    notice, this list of conditions and the following disclaimer.
     48  * 2. Redistributions in binary form must reproduce the above copyright
     49  *    notice, this list of conditions and the following disclaimer in the
     50  *    documentation and/or other materials provided with the distribution.
     51  * 3. All advertising materials mentioning features or use of this software
     52  *    must display the following acknowledgement:
     53  *	This product includes software developed by the University of
     54  *	California, Berkeley and its contributors.
     55  * 4. Neither the name of the University nor the names of its contributors
     56  *    may be used to endorse or promote products derived from this software
     57  *    without specific prior written permission.
     58  *
     59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     69  * SUCH DAMAGE.
     70  *
     71  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
     72  */
     73 
     74 #include <sys/cdefs.h>
     75 __KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.73 2015/04/20 19:36:55 riastradh Exp $");
     76 
     77 #include <sys/param.h>
     78 #include <sys/systm.h>
     79 #include <sys/proc.h>
     80 #include <sys/time.h>
     81 #include <sys/kernel.h>
     82 #include <sys/vnode.h>
     83 #include <sys/namei.h>
     84 #include <sys/malloc.h>
     85 #include <sys/dirent.h>
     86 #include <sys/file.h>
     87 #include <sys/filedesc.h>
     88 #include <sys/queue.h>
     89 #include <sys/mount.h>
     90 #include <sys/stat.h>
     91 #include <sys/kauth.h>
     92 
     93 #include <uvm/uvm_extern.h>
     94 
     95 #include <fs/union/union.h>
     96 #include <miscfs/genfs/genfs.h>
     97 #include <miscfs/specfs/specdev.h>
     98 
     99 static LIST_HEAD(uhashhead, union_node) *uhashtbl;
    100 static u_long uhash_mask;		/* size of hash table - 1 */
    101 #define UNION_HASH(u, l) \
    102 	((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask)
    103 #define NOHASH	((u_long)-1)
    104 
    105 static kmutex_t uhash_lock;
    106 
    107 void union_updatevp(struct union_node *, struct vnode *, struct vnode *);
    108 static void union_ref(struct union_node *);
    109 static void union_rele(struct union_node *);
    110 static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t,    const char *);
    111 int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
    112 static void union_dircache_r(struct vnode *, struct vnode ***, int *);
    113 struct vnode *union_dircache(struct vnode *, struct lwp *);
    114 
    115 void
    116 union_init(void)
    117 {
    118 
    119 	mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE);
    120 	uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask);
    121 }
    122 
    123 void
    124 union_reinit(void)
    125 {
    126 	struct union_node *un;
    127 	struct uhashhead *oldhash, *hash;
    128 	u_long oldmask, mask, val;
    129 	int i;
    130 
    131 	hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
    132 	mutex_enter(&uhash_lock);
    133 	oldhash = uhashtbl;
    134 	oldmask = uhash_mask;
    135 	uhashtbl = hash;
    136 	uhash_mask = mask;
    137 	for (i = 0; i <= oldmask; i++) {
    138 		while ((un = LIST_FIRST(&oldhash[i])) != NULL) {
    139 			LIST_REMOVE(un, un_cache);
    140 			val = UNION_HASH(un->un_uppervp, un->un_lowervp);
    141 			LIST_INSERT_HEAD(&hash[val], un, un_cache);
    142 		}
    143 	}
    144 	mutex_exit(&uhash_lock);
    145 	hashdone(oldhash, HASH_LIST, oldmask);
    146 }
    147 
    148 /*
    149  * Free global unionfs resources.
    150  */
    151 void
    152 union_done(void)
    153 {
    154 
    155 	hashdone(uhashtbl, HASH_LIST, uhash_mask);
    156 	mutex_destroy(&uhash_lock);
    157 
    158 	/* Make sure to unset the readdir hook. */
    159 	vn_union_readdir_hook = NULL;
    160 }
    161 
    162 void
    163 union_updatevp(struct union_node *un, struct vnode *uppervp,
    164 	struct vnode *lowervp)
    165 {
    166 	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
    167 	int nhash = UNION_HASH(uppervp, lowervp);
    168 	int docache = (lowervp != NULLVP || uppervp != NULLVP);
    169 	bool un_unlock;
    170 
    171 	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
    172 
    173 	mutex_enter(&uhash_lock);
    174 
    175 	if (!docache || ohash != nhash) {
    176 		if (un->un_cflags & UN_CACHED) {
    177 			un->un_cflags &= ~UN_CACHED;
    178 			LIST_REMOVE(un, un_cache);
    179 		}
    180 	}
    181 
    182 	if (un->un_lowervp != lowervp) {
    183 		if (un->un_lowervp) {
    184 			vrele(un->un_lowervp);
    185 			if (un->un_path) {
    186 				free(un->un_path, M_TEMP);
    187 				un->un_path = 0;
    188 			}
    189 			if (un->un_dirvp) {
    190 				vrele(un->un_dirvp);
    191 				un->un_dirvp = NULLVP;
    192 			}
    193 		}
    194 		un->un_lowervp = lowervp;
    195 		mutex_enter(&un->un_lock);
    196 		un->un_lowersz = VNOVAL;
    197 		mutex_exit(&un->un_lock);
    198 	}
    199 
    200 	if (un->un_uppervp != uppervp) {
    201 		if (un->un_uppervp) {
    202 			un_unlock = false;
    203 			vrele(un->un_uppervp);
    204 		} else
    205 			un_unlock = true;
    206 
    207 		mutex_enter(&un->un_lock);
    208 		un->un_uppervp = uppervp;
    209 		mutex_exit(&un->un_lock);
    210 		if (un_unlock) {
    211 			struct vop_unlock_args ap;
    212 
    213 			ap.a_vp = UNIONTOV(un);
    214 			genfs_unlock(&ap);
    215 		}
    216 		mutex_enter(&un->un_lock);
    217 		un->un_uppersz = VNOVAL;
    218 		mutex_exit(&un->un_lock);
    219 		/* Update union vnode interlock. */
    220 		if (uppervp != NULL) {
    221 			mutex_obj_hold(uppervp->v_interlock);
    222 			uvm_obj_setlock(&UNIONTOV(un)->v_uobj,
    223 			    uppervp->v_interlock);
    224 		}
    225 	}
    226 
    227 	if (docache && (ohash != nhash)) {
    228 		LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
    229 		un->un_cflags |= UN_CACHED;
    230 	}
    231 
    232 	mutex_exit(&uhash_lock);
    233 }
    234 
    235 void
    236 union_newlower(struct union_node *un, struct vnode *lowervp)
    237 {
    238 
    239 	union_updatevp(un, un->un_uppervp, lowervp);
    240 }
    241 
    242 void
    243 union_newupper(struct union_node *un, struct vnode *uppervp)
    244 {
    245 
    246 	union_updatevp(un, uppervp, un->un_lowervp);
    247 }
    248 
    249 /*
    250  * Keep track of size changes in the underlying vnodes.
    251  * If the size changes, then callback to the vm layer
    252  * giving priority to the upper layer size.
    253  *
    254  * Mutex un_lock hold on entry and released on return.
    255  */
    256 void
    257 union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
    258 {
    259 	struct union_node *un = VTOUNION(vp);
    260 	off_t sz;
    261 
    262 	KASSERT(mutex_owned(&un->un_lock));
    263 	/* only interested in regular files */
    264 	if (vp->v_type != VREG) {
    265 		mutex_exit(&un->un_lock);
    266 		uvm_vnp_setsize(vp, 0);
    267 		return;
    268 	}
    269 
    270 	sz = VNOVAL;
    271 
    272 	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
    273 		un->un_uppersz = uppersz;
    274 		if (sz == VNOVAL)
    275 			sz = un->un_uppersz;
    276 	}
    277 
    278 	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
    279 		un->un_lowersz = lowersz;
    280 		if (sz == VNOVAL)
    281 			sz = un->un_lowersz;
    282 	}
    283 	mutex_exit(&un->un_lock);
    284 
    285 	if (sz != VNOVAL) {
    286 #ifdef UNION_DIAGNOSTIC
    287 		printf("union: %s size now %qd\n",
    288 		    uppersz != VNOVAL ? "upper" : "lower", sz);
    289 #endif
    290 		uvm_vnp_setsize(vp, sz);
    291 	}
    292 }
    293 
    294 static void
    295 union_ref(struct union_node *un)
    296 {
    297 
    298 	KASSERT(mutex_owned(&uhash_lock));
    299 	un->un_refs++;
    300 }
    301 
    302 static void
    303 union_rele(struct union_node *un)
    304 {
    305 
    306 	mutex_enter(&uhash_lock);
    307 	un->un_refs--;
    308 	if (un->un_refs > 0) {
    309 		mutex_exit(&uhash_lock);
    310 		return;
    311 	}
    312 	if (un->un_cflags & UN_CACHED) {
    313 		un->un_cflags &= ~UN_CACHED;
    314 		LIST_REMOVE(un, un_cache);
    315 	}
    316 	mutex_exit(&uhash_lock);
    317 
    318 	if (un->un_pvp != NULLVP)
    319 		vrele(un->un_pvp);
    320 	if (un->un_uppervp != NULLVP)
    321 		vrele(un->un_uppervp);
    322 	if (un->un_lowervp != NULLVP)
    323 		vrele(un->un_lowervp);
    324 	if (un->un_dirvp != NULLVP)
    325 		vrele(un->un_dirvp);
    326 	if (un->un_path)
    327 		free(un->un_path, M_TEMP);
    328 	mutex_destroy(&un->un_lock);
    329 
    330 	free(un, M_TEMP);
    331 }
    332 
    333 /*
    334  * allocate a union_node/vnode pair.  the vnode is
    335  * referenced and unlocked.  the new vnode is returned
    336  * via (vpp).  (mp) is the mountpoint of the union filesystem,
    337  * (dvp) is the parent directory where the upper layer object
    338  * should exist (but doesn't) and (cnp) is the componentname
    339  * information which is partially copied to allow the upper
    340  * layer object to be created at a later time.  (uppervp)
    341  * and (lowervp) reference the upper and lower layer objects
    342  * being mapped.  either, but not both, can be nil.
    343  * both, if supplied, are unlocked.
    344  * the reference is either maintained in the new union_node
    345  * object which is allocated, or they are vrele'd.
    346  *
    347  * all union_nodes are maintained on a hash
    348  * list.  new nodes are only allocated when they cannot
    349  * be found on this list.  entries on the list are
    350  * removed when the vfs reclaim entry is called.
    351  *
    352  * the vnode gets attached or referenced with vcache_get().
    353  */
    354 int
    355 union_allocvp(
    356 	struct vnode **vpp,
    357 	struct mount *mp,
    358 	struct vnode *undvp,		/* parent union vnode */
    359 	struct vnode *dvp,		/* may be null */
    360 	struct componentname *cnp,	/* may be null */
    361 	struct vnode *uppervp,		/* may be null */
    362 	struct vnode *lowervp,		/* may be null */
    363 	int docache)
    364 {
    365 	int error;
    366 	struct union_node *un = NULL, *un1;
    367 	struct vnode *vp, *xlowervp = NULLVP;
    368 	u_long hash[3];
    369 	int try;
    370 	bool is_dotdot;
    371 
    372 	is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT));
    373 
    374 	if (uppervp == NULLVP && lowervp == NULLVP)
    375 		panic("union: unidentifiable allocation");
    376 
    377 	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
    378 		xlowervp = lowervp;
    379 		lowervp = NULLVP;
    380 	}
    381 
    382 	if (!docache) {
    383 		un = NULL;
    384 		goto found;
    385 	}
    386 
    387 	/*
    388 	 * If both uppervp and lowervp are not NULL we have to
    389 	 * search union nodes with one vnode as NULL too.
    390 	 */
    391 	hash[0] = UNION_HASH(uppervp, lowervp);
    392 	if (uppervp == NULL || lowervp == NULL) {
    393 		hash[1] = hash[2] = NOHASH;
    394 	} else {
    395 		hash[1] = UNION_HASH(uppervp, NULLVP);
    396 		hash[2] = UNION_HASH(NULLVP, lowervp);
    397 	}
    398 
    399 loop:
    400 	mutex_enter(&uhash_lock);
    401 
    402 	for (try = 0; try < 3; try++) {
    403 		if (hash[try] == NOHASH)
    404 			continue;
    405 		LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) {
    406 			if ((un->un_lowervp && un->un_lowervp != lowervp) ||
    407 			    (un->un_uppervp && un->un_uppervp != uppervp) ||
    408 			    un->un_mount != mp)
    409 				continue;
    410 
    411 			union_ref(un);
    412 			mutex_exit(&uhash_lock);
    413 			error = vcache_get(mp, &un, sizeof(un), &vp);
    414 			KASSERT(error != 0 || UNIONTOV(un) == vp);
    415 			union_rele(un);
    416 			if (error == ENOENT)
    417 				goto loop;
    418 			else if (error)
    419 				goto out;
    420 			goto found;
    421 		}
    422 	}
    423 
    424 	mutex_exit(&uhash_lock);
    425 
    426 found:
    427 	if (un) {
    428 		if (uppervp != dvp) {
    429 			if (is_dotdot)
    430 				VOP_UNLOCK(dvp);
    431 			vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY);
    432 			if (is_dotdot)
    433 				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
    434 		}
    435 		/*
    436 		 * Save information about the upper layer.
    437 		 */
    438 		if (uppervp != un->un_uppervp) {
    439 			union_newupper(un, uppervp);
    440 		} else if (uppervp) {
    441 			vrele(uppervp);
    442 		}
    443 
    444 		/*
    445 		 * Save information about the lower layer.
    446 		 * This needs to keep track of pathname
    447 		 * and directory information which union_vn_create
    448 		 * might need.
    449 		 */
    450 		if (lowervp != un->un_lowervp) {
    451 			union_newlower(un, lowervp);
    452 			if (cnp && (lowervp != NULLVP)) {
    453 				un->un_path = malloc(cnp->cn_namelen+1,
    454 						M_TEMP, M_WAITOK);
    455 				memcpy(un->un_path, cnp->cn_nameptr,
    456 						cnp->cn_namelen);
    457 				un->un_path[cnp->cn_namelen] = '\0';
    458 				vref(dvp);
    459 				un->un_dirvp = dvp;
    460 			}
    461 		} else if (lowervp) {
    462 			vrele(lowervp);
    463 		}
    464 		*vpp = UNIONTOV(un);
    465 		if (uppervp != dvp)
    466 			VOP_UNLOCK(*vpp);
    467 		error = 0;
    468 		goto out;
    469 	}
    470 
    471 	un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);
    472 	mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE);
    473 	un->un_refs = 1;
    474 	un->un_mount = mp;
    475 	un->un_vnode = NULL;
    476 	un->un_uppervp = uppervp;
    477 	un->un_lowervp = lowervp;
    478 	un->un_pvp = undvp;
    479 	if (undvp != NULLVP)
    480 		vref(undvp);
    481 	un->un_dircache = 0;
    482 	un->un_openl = 0;
    483 	un->un_cflags = 0;
    484 
    485 	un->un_uppersz = VNOVAL;
    486 	un->un_lowersz = VNOVAL;
    487 
    488 	if (dvp && cnp && (lowervp != NULLVP)) {
    489 		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
    490 		memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
    491 		un->un_path[cnp->cn_namelen] = '\0';
    492 		vref(dvp);
    493 		un->un_dirvp = dvp;
    494 	} else {
    495 		un->un_path = 0;
    496 		un->un_dirvp = 0;
    497 	}
    498 
    499 	if (docache) {
    500 		mutex_enter(&uhash_lock);
    501 		LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) {
    502 			if (un1->un_lowervp == lowervp &&
    503 			    un1->un_uppervp == uppervp &&
    504 			    un1->un_mount == mp) {
    505 				/*
    506 				 * Another thread beat us, push back freshly
    507 				 * allocated node and retry.
    508 				 */
    509 				mutex_exit(&uhash_lock);
    510 				union_rele(un);
    511 				goto loop;
    512 			}
    513 		}
    514 		LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache);
    515 		un->un_cflags |= UN_CACHED;
    516 		mutex_exit(&uhash_lock);
    517 	}
    518 
    519 	error = vcache_get(mp, &un, sizeof(un), vpp);
    520 	KASSERT(error != 0 || UNIONTOV(un) == *vpp);
    521 	union_rele(un);
    522 	if (error == ENOENT)
    523 		goto loop;
    524 
    525 out:
    526 	if (xlowervp)
    527 		vrele(xlowervp);
    528 
    529 	return error;
    530 }
    531 
    532 int
    533 union_freevp(struct vnode *vp)
    534 {
    535 	struct union_node *un = VTOUNION(vp);
    536 
    537 	/* Detach vnode from union node. */
    538 	un->un_vnode = NULL;
    539 	un->un_uppersz = VNOVAL;
    540 	un->un_lowersz = VNOVAL;
    541 
    542 	vcache_remove(vp->v_mount, &un, sizeof(un));
    543 
    544 	/* Detach union node from vnode. */
    545 	mutex_enter(vp->v_interlock);
    546 	vp->v_data = NULL;
    547 	mutex_exit(vp->v_interlock);
    548 
    549 	union_rele(un);
    550 
    551 	return 0;
    552 }
    553 
    554 int
    555 union_loadvnode(struct mount *mp, struct vnode *vp,
    556     const void *key, size_t key_len, const void **new_key)
    557 {
    558 	struct vattr va;
    559 	struct vnode *svp;
    560 	struct union_node *un;
    561 	struct union_mount *um;
    562 	voff_t uppersz, lowersz;
    563 
    564 	KASSERT(key_len == sizeof(un));
    565 	memcpy(&un, key, key_len);
    566 
    567 	um = MOUNTTOUNIONMOUNT(mp);
    568 	svp = (un->un_uppervp != NULLVP) ? un->un_uppervp : un->un_lowervp;
    569 
    570 	vp->v_tag = VT_UNION;
    571 	vp->v_op = union_vnodeop_p;
    572 	vp->v_data = un;
    573 	un->un_vnode = vp;
    574 
    575 	vp->v_type = svp->v_type;
    576 	if (svp->v_type == VCHR || svp->v_type == VBLK)
    577 		spec_node_init(vp, svp->v_rdev);
    578 
    579 	mutex_obj_hold(svp->v_interlock);
    580 	uvm_obj_setlock(&vp->v_uobj, svp->v_interlock);
    581 
    582 	/* detect the root vnode (and aliases) */
    583 	if ((un->un_uppervp == um->um_uppervp) &&
    584 	    ((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) {
    585 		if (un->un_lowervp == NULLVP) {
    586 			un->un_lowervp = um->um_lowervp;
    587 			if (un->un_lowervp != NULLVP)
    588 				vref(un->un_lowervp);
    589 		}
    590 		vp->v_vflag |= VV_ROOT;
    591 	}
    592 
    593 	uppersz = lowersz = VNOVAL;
    594 	if (un->un_uppervp != NULLVP) {
    595 		if (vn_lock(un->un_uppervp, LK_SHARED) == 0) {
    596 			if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0)
    597 				uppersz = va.va_size;
    598 			VOP_UNLOCK(un->un_uppervp);
    599 		}
    600 	}
    601 	if (un->un_lowervp != NULLVP) {
    602 		if (vn_lock(un->un_lowervp, LK_SHARED) == 0) {
    603 			if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0)
    604 				lowersz = va.va_size;
    605 			VOP_UNLOCK(un->un_lowervp);
    606 		}
    607 	}
    608 
    609 	mutex_enter(&un->un_lock);
    610 	union_newsize(vp, uppersz, lowersz);
    611 
    612 	mutex_enter(&uhash_lock);
    613 	union_ref(un);
    614 	mutex_exit(&uhash_lock);
    615 
    616 	*new_key = &vp->v_data;
    617 
    618 	return 0;
    619 }
    620 
    621 /*
    622  * copyfile.  copy the vnode (fvp) to the vnode (tvp)
    623  * using a sequence of reads and writes.  both (fvp)
    624  * and (tvp) are locked on entry and exit.
    625  */
    626 int
    627 union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
    628 	struct lwp *l)
    629 {
    630 	char *tbuf;
    631 	struct uio uio;
    632 	struct iovec iov;
    633 	int error = 0;
    634 
    635 	/*
    636 	 * strategy:
    637 	 * allocate a buffer of size MAXBSIZE.
    638 	 * loop doing reads and writes, keeping track
    639 	 * of the current uio offset.
    640 	 * give up at the first sign of trouble.
    641 	 */
    642 
    643 	uio.uio_offset = 0;
    644 	UIO_SETUP_SYSSPACE(&uio);
    645 
    646 	tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
    647 
    648 	/* ugly loop follows... */
    649 	do {
    650 		off_t offset = uio.uio_offset;
    651 
    652 		uio.uio_iov = &iov;
    653 		uio.uio_iovcnt = 1;
    654 		iov.iov_base = tbuf;
    655 		iov.iov_len = MAXBSIZE;
    656 		uio.uio_resid = iov.iov_len;
    657 		uio.uio_rw = UIO_READ;
    658 		error = VOP_READ(fvp, &uio, 0, cred);
    659 
    660 		if (error == 0) {
    661 			uio.uio_iov = &iov;
    662 			uio.uio_iovcnt = 1;
    663 			iov.iov_base = tbuf;
    664 			iov.iov_len = MAXBSIZE - uio.uio_resid;
    665 			uio.uio_offset = offset;
    666 			uio.uio_rw = UIO_WRITE;
    667 			uio.uio_resid = iov.iov_len;
    668 
    669 			if (uio.uio_resid == 0)
    670 				break;
    671 
    672 			do {
    673 				error = VOP_WRITE(tvp, &uio, 0, cred);
    674 			} while ((uio.uio_resid > 0) && (error == 0));
    675 		}
    676 
    677 	} while (error == 0);
    678 
    679 	free(tbuf, M_TEMP);
    680 	return (error);
    681 }
    682 
    683 /*
    684  * (un) is assumed to be locked on entry and remains
    685  * locked on exit.
    686  */
    687 int
    688 union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
    689 	struct lwp *l)
    690 {
    691 	int error;
    692 	struct vnode *lvp, *uvp;
    693 	struct vattr lvattr, uvattr;
    694 
    695 	error = union_vn_create(&uvp, un, l);
    696 	if (error)
    697 		return (error);
    698 
    699 	KASSERT(VOP_ISLOCKED(uvp) == LK_EXCLUSIVE);
    700 	union_newupper(un, uvp);
    701 
    702 	lvp = un->un_lowervp;
    703 
    704 	if (docopy) {
    705 		/*
    706 		 * XX - should not ignore errors
    707 		 * from VOP_CLOSE
    708 		 */
    709 		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
    710 
    711         	error = VOP_GETATTR(lvp, &lvattr, cred);
    712 		if (error == 0)
    713 			error = VOP_OPEN(lvp, FREAD, cred);
    714 		if (error == 0) {
    715 			error = union_copyfile(lvp, uvp, cred, l);
    716 			(void) VOP_CLOSE(lvp, FREAD, cred);
    717 		}
    718 		if (error == 0) {
    719 			/* Copy permissions up too */
    720 			vattr_null(&uvattr);
    721 			uvattr.va_mode = lvattr.va_mode;
    722 			uvattr.va_flags = lvattr.va_flags;
    723         		error = VOP_SETATTR(uvp, &uvattr, cred);
    724 		}
    725 		VOP_UNLOCK(lvp);
    726 #ifdef UNION_DIAGNOSTIC
    727 		if (error == 0)
    728 			uprintf("union: copied up %s\n", un->un_path);
    729 #endif
    730 
    731 	}
    732 	union_vn_close(uvp, FWRITE, cred, l);
    733 
    734 	/*
    735 	 * Subsequent IOs will go to the top layer, so
    736 	 * call close on the lower vnode and open on the
    737 	 * upper vnode to ensure that the filesystem keeps
    738 	 * its references counts right.  This doesn't do
    739 	 * the right thing with (cred) and (FREAD) though.
    740 	 * Ignoring error returns is not right, either.
    741 	 */
    742 	if (error == 0) {
    743 		int i;
    744 
    745 		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
    746 		for (i = 0; i < un->un_openl; i++) {
    747 			(void) VOP_CLOSE(lvp, FREAD, cred);
    748 			(void) VOP_OPEN(uvp, FREAD, cred);
    749 		}
    750 		un->un_openl = 0;
    751 		VOP_UNLOCK(lvp);
    752 	}
    753 
    754 	return (error);
    755 
    756 }
    757 
    758 /*
    759  * Prepare the creation of a new node in the upper layer.
    760  *
    761  * (dvp) is the directory in which to create the new node.
    762  * it is locked on entry and exit.
    763  * (cnp) is the componentname to be created.
    764  * (cred, path, hash) are credentials, path and its hash to fill (cnp).
    765  */
    766 static int
    767 union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
    768     const char *path)
    769 {
    770 	int error;
    771 	struct vnode *vp;
    772 
    773 	cnp->cn_nameiop = CREATE;
    774 	cnp->cn_flags = LOCKPARENT | ISLASTCN;
    775 	cnp->cn_cred = cred;
    776 	cnp->cn_nameptr = path;
    777 	cnp->cn_namelen = strlen(path);
    778 
    779 	error = VOP_LOOKUP(dvp, &vp, cnp);
    780 
    781 	if (error == 0) {
    782 		KASSERT(vp != NULL);
    783 		VOP_ABORTOP(dvp, cnp);
    784 		vrele(vp);
    785 		error = EEXIST;
    786 	} else if (error == EJUSTRETURN) {
    787 		error = 0;
    788 	}
    789 
    790 	return error;
    791 }
    792 
    793 /*
    794  * Create a shadow directory in the upper layer.
    795  * The new vnode is returned locked.
    796  *
    797  * (um) points to the union mount structure for access to the
    798  * the mounting process's credentials.
    799  * (dvp) is the directory in which to create the shadow directory.
    800  * it is unlocked on entry and exit.
    801  * (cnp) is the componentname to be created.
    802  * (vpp) is the returned newly created shadow directory, which
    803  * is returned locked.
    804  *
    805  * N.B. We still attempt to create shadow directories even if the union
    806  * is mounted read-only, which is a little nonintuitive.
    807  */
    808 int
    809 union_mkshadow(struct union_mount *um, struct vnode *dvp,
    810 	struct componentname *cnp, struct vnode **vpp)
    811 {
    812 	int error;
    813 	struct vattr va;
    814 	struct componentname cn;
    815 	char *pnbuf;
    816 
    817 	if (cnp->cn_namelen + 1 > MAXPATHLEN)
    818 		return ENAMETOOLONG;
    819 	pnbuf = PNBUF_GET();
    820 	memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
    821 	pnbuf[cnp->cn_namelen] = '\0';
    822 
    823 	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
    824 
    825 	error = union_do_lookup(dvp, &cn,
    826 	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf);
    827 	if (error) {
    828 		VOP_UNLOCK(dvp);
    829 		PNBUF_PUT(pnbuf);
    830 		return error;
    831 	}
    832 
    833 	/*
    834 	 * policy: when creating the shadow directory in the
    835 	 * upper layer, create it owned by the user who did
    836 	 * the mount, group from parent directory, and mode
    837 	 * 777 modified by umask (ie mostly identical to the
    838 	 * mkdir syscall).  (jsp, kb)
    839 	 */
    840 
    841 	vattr_null(&va);
    842 	va.va_type = VDIR;
    843 	va.va_mode = um->um_cmode;
    844 
    845 	KASSERT(*vpp == NULL);
    846 	error = VOP_MKDIR(dvp, vpp, &cn, &va);
    847 	VOP_UNLOCK(dvp);
    848 	PNBUF_PUT(pnbuf);
    849 	return error;
    850 }
    851 
    852 /*
    853  * Create a whiteout entry in the upper layer.
    854  *
    855  * (um) points to the union mount structure for access to the
    856  * the mounting process's credentials.
    857  * (dvp) is the directory in which to create the whiteout.
    858  * it is locked on entry and exit.
    859  * (cnp) is the componentname to be created.
    860  * (un) holds the path and its hash to be created.
    861  */
    862 int
    863 union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
    864 	struct componentname *cnp, struct union_node *un)
    865 {
    866 	int error;
    867 	struct componentname cn;
    868 
    869 	error = union_do_lookup(dvp, &cn,
    870 	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
    871 	    un->un_path);
    872 	if (error)
    873 		return error;
    874 
    875 	error = VOP_WHITEOUT(dvp, &cn, CREATE);
    876 	return error;
    877 }
    878 
    879 /*
    880  * union_vn_create: creates and opens a new shadow file
    881  * on the upper union layer.  this function is similar
    882  * in spirit to calling vn_open but it avoids calling namei().
    883  * the problem with calling namei is that a) it locks too many
    884  * things, and b) it doesn't start at the "right" directory,
    885  * whereas union_do_lookup is told where to start.
    886  */
    887 int
    888 union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
    889 {
    890 	struct vnode *vp;
    891 	kauth_cred_t cred = l->l_cred;
    892 	struct vattr vat;
    893 	struct vattr *vap = &vat;
    894 	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
    895 	int error;
    896 	int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
    897 	struct componentname cn;
    898 
    899 	*vpp = NULLVP;
    900 
    901 	vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);
    902 
    903 	error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
    904 	    un->un_path);
    905 	if (error) {
    906 		VOP_UNLOCK(un->un_dirvp);
    907 		return error;
    908 	}
    909 
    910 	/*
    911 	 * Good - there was no race to create the file
    912 	 * so go ahead and create it.  The permissions
    913 	 * on the file will be 0666 modified by the
    914 	 * current user's umask.  Access to the file, while
    915 	 * it is unioned, will require access to the top *and*
    916 	 * bottom files.  Access when not unioned will simply
    917 	 * require access to the top-level file.
    918 	 * TODO: confirm choice of access permissions.
    919 	 */
    920 	vattr_null(vap);
    921 	vap->va_type = VREG;
    922 	vap->va_mode = cmode;
    923 	vp = NULL;
    924 	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
    925 	if (error) {
    926 		VOP_UNLOCK(un->un_dirvp);
    927 		return error;
    928 	}
    929 
    930 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    931 	VOP_UNLOCK(un->un_dirvp);
    932 	error = VOP_OPEN(vp, fmode, cred);
    933 	if (error) {
    934 		vput(vp);
    935 		return error;
    936 	}
    937 
    938 	vp->v_writecount++;
    939 	*vpp = vp;
    940 	return 0;
    941 }
    942 
    943 int
    944 union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
    945 {
    946 
    947 	if (fmode & FWRITE)
    948 		--vp->v_writecount;
    949 	return (VOP_CLOSE(vp, fmode, cred));
    950 }
    951 
    952 void
    953 union_removed_upper(struct union_node *un)
    954 {
    955 	struct vnode *vp = UNIONTOV(un);
    956 
    957 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
    958 #if 1
    959 	/*
    960 	 * We do not set the uppervp to NULLVP here, because lowervp
    961 	 * may also be NULLVP, so this routine would end up creating
    962 	 * a bogus union node with no upper or lower VP (that causes
    963 	 * pain in many places that assume at least one VP exists).
    964 	 * Since we've removed this node from the cache hash chains,
    965 	 * it won't be found again.  When all current holders
    966 	 * release it, union_inactive() will vgone() it.
    967 	 */
    968 	union_diruncache(un);
    969 #else
    970 	union_newupper(un, NULLVP);
    971 #endif
    972 
    973 	VOP_UNLOCK(vp);
    974 
    975 	mutex_enter(&uhash_lock);
    976 	if (un->un_cflags & UN_CACHED) {
    977 		un->un_cflags &= ~UN_CACHED;
    978 		LIST_REMOVE(un, un_cache);
    979 	}
    980 	mutex_exit(&uhash_lock);
    981 }
    982 
#if 0
/*
 * Disabled code.  Return the lower vnode of `vp' with a reference
 * obtained via vget(), provided it exists and has the same type as
 * `vp'; otherwise return NULLVP.
 */
struct vnode *
union_lowervp(struct vnode *vp)
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_lowervp == NULLVP)
		return (NULLVP);

	if (vp->v_type != un->un_lowervp->v_type)
		return (NULLVP);

	if (vget(un->un_lowervp, 0, true /* wait */) != 0)
		return (NULLVP);

	return (un->un_lowervp);
}
#endif
    998 
    999 /*
   1000  * determine whether a whiteout is needed
   1001  * during a remove/rmdir operation.
   1002  */
   1003 int
   1004 union_dowhiteout(struct union_node *un, kauth_cred_t cred)
   1005 {
   1006 	struct vattr va;
   1007 
   1008 	if (un->un_lowervp != NULLVP)
   1009 		return (1);
   1010 
   1011 	if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
   1012 	    (va.va_flags & OPAQUE))
   1013 		return (1);
   1014 
   1015 	return (0);
   1016 }
   1017 
   1018 static void
   1019 union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
   1020 {
   1021 	struct union_node *un;
   1022 
   1023 	if (vp->v_op != union_vnodeop_p) {
   1024 		if (vppp) {
   1025 			vref(vp);
   1026 			*(*vppp)++ = vp;
   1027 			if (--(*cntp) == 0)
   1028 				panic("union: dircache table too small");
   1029 		} else {
   1030 			(*cntp)++;
   1031 		}
   1032 
   1033 		return;
   1034 	}
   1035 
   1036 	un = VTOUNION(vp);
   1037 	if (un->un_uppervp != NULLVP)
   1038 		union_dircache_r(un->un_uppervp, vppp, cntp);
   1039 	if (un->un_lowervp != NULLVP)
   1040 		union_dircache_r(un->un_lowervp, vppp, cntp);
   1041 }
   1042 
   1043 struct vnode *
   1044 union_dircache(struct vnode *vp, struct lwp *l)
   1045 {
   1046 	int cnt;
   1047 	struct vnode *nvp = NULLVP;
   1048 	struct vnode **vpp;
   1049 	struct vnode **dircache;
   1050 	int error;
   1051 
   1052 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   1053 	dircache = VTOUNION(vp)->un_dircache;
   1054 
   1055 	nvp = NULLVP;
   1056 
   1057 	if (dircache == 0) {
   1058 		cnt = 0;
   1059 		union_dircache_r(vp, 0, &cnt);
   1060 		cnt++;
   1061 		dircache = (struct vnode **)
   1062 				malloc(cnt * sizeof(struct vnode *),
   1063 					M_TEMP, M_WAITOK);
   1064 		vpp = dircache;
   1065 		union_dircache_r(vp, &vpp, &cnt);
   1066 		VTOUNION(vp)->un_dircache = dircache;
   1067 		*vpp = NULLVP;
   1068 		vpp = dircache + 1;
   1069 	} else {
   1070 		vpp = dircache;
   1071 		do {
   1072 			if (*vpp++ == VTOUNION(vp)->un_uppervp)
   1073 				break;
   1074 		} while (*vpp != NULLVP);
   1075 	}
   1076 
   1077 	if (*vpp == NULLVP)
   1078 		goto out;
   1079 
   1080 	vref(*vpp);
   1081 	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, *vpp, NULLVP, 0);
   1082 	if (!error) {
   1083 		vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY);
   1084 		VTOUNION(vp)->un_dircache = 0;
   1085 		VTOUNION(nvp)->un_dircache = dircache;
   1086 	}
   1087 
   1088 out:
   1089 	VOP_UNLOCK(vp);
   1090 	return (nvp);
   1091 }
   1092 
   1093 void
   1094 union_diruncache(struct union_node *un)
   1095 {
   1096 	struct vnode **vpp;
   1097 
   1098 	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
   1099 	if (un->un_dircache != 0) {
   1100 		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
   1101 			vrele(*vpp);
   1102 		free(un->un_dircache, M_TEMP);
   1103 		un->un_dircache = 0;
   1104 	}
   1105 }
   1106 
   1107 /*
   1108  * Check whether node can rmdir (check empty).
   1109  */
   1110 int
   1111 union_check_rmdir(struct union_node *un, kauth_cred_t cred)
   1112 {
   1113 	int dirlen, eofflag, error;
   1114 	char *dirbuf;
   1115 	struct vattr va;
   1116 	struct vnode *tvp;
   1117 	struct dirent *dp, *edp;
   1118 	struct componentname cn;
   1119 	struct iovec aiov;
   1120 	struct uio auio;
   1121 
   1122 	KASSERT(un->un_uppervp != NULL);
   1123 
   1124 	/* Check upper for being opaque. */
   1125 	KASSERT(VOP_ISLOCKED(un->un_uppervp));
   1126 	error = VOP_GETATTR(un->un_uppervp, &va, cred);
   1127 	if (error || (va.va_flags & OPAQUE))
   1128 		return error;
   1129 
   1130 	if (un->un_lowervp == NULL)
   1131 		return 0;
   1132 
   1133 	/* Check lower for being empty. */
   1134 	vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
   1135 	error = VOP_GETATTR(un->un_lowervp, &va, cred);
   1136 	if (error) {
   1137 		VOP_UNLOCK(un->un_lowervp);
   1138 		return error;
   1139 	}
   1140 	dirlen = va.va_blocksize;
   1141 	dirbuf = kmem_alloc(dirlen, KM_SLEEP);
   1142 	if (dirbuf == NULL) {
   1143 		VOP_UNLOCK(un->un_lowervp);
   1144 		return ENOMEM;
   1145 	}
   1146 	/* error = 0; */
   1147 	eofflag = 0;
   1148 	auio.uio_offset = 0;
   1149 	do {
   1150 		aiov.iov_len = dirlen;
   1151 		aiov.iov_base = dirbuf;
   1152 		auio.uio_iov = &aiov;
   1153 		auio.uio_iovcnt = 1;
   1154 		auio.uio_resid = aiov.iov_len;
   1155 		auio.uio_rw = UIO_READ;
   1156 		UIO_SETUP_SYSSPACE(&auio);
   1157 		error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
   1158 		    NULL, NULL);
   1159 		if (error)
   1160 			break;
   1161 		edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
   1162 		for (dp = (struct dirent *)dirbuf;
   1163 		    error == 0 && dp < edp;
   1164 		    dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
   1165 			if (dp->d_reclen == 0) {
   1166 				error = ENOTEMPTY;
   1167 				break;
   1168 			}
   1169 			if (dp->d_type == DT_WHT ||
   1170 			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
   1171 			    (dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
   1172 				continue;
   1173 			/* Check for presence in the upper layer. */
   1174 			cn.cn_nameiop = LOOKUP;
   1175 			cn.cn_flags = ISLASTCN | RDONLY;
   1176 			cn.cn_cred = cred;
   1177 			cn.cn_nameptr = dp->d_name;
   1178 			cn.cn_namelen = dp->d_namlen;
   1179 			error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
   1180 			if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
   1181 				error = 0;
   1182 				continue;
   1183 			}
   1184 			if (error == 0)
   1185 				vrele(tvp);
   1186 			error = ENOTEMPTY;
   1187 		}
   1188 	} while (error == 0 && !eofflag);
   1189 	kmem_free(dirbuf, dirlen);
   1190 	VOP_UNLOCK(un->un_lowervp);
   1191 
   1192 	return error;
   1193 }
   1194 
   1195 /*
   1196  * This hook is called from vn_readdir() to switch to lower directory
   1197  * entry after the upper directory is read.
   1198  */
   1199 int
   1200 union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
   1201 {
   1202 	struct vnode *vp = *vpp, *lvp;
   1203 	struct vattr va;
   1204 	int error;
   1205 
   1206 	if (vp->v_op != union_vnodeop_p)
   1207 		return (0);
   1208 
   1209 	/*
   1210 	 * If the directory is opaque,
   1211 	 * then don't show lower entries
   1212 	 */
   1213 	vn_lock(vp, LK_SHARED | LK_RETRY);
   1214 	error = VOP_GETATTR(vp, &va, fp->f_cred);
   1215 	VOP_UNLOCK(vp);
   1216 	if (error || (va.va_flags & OPAQUE))
   1217 		return error;
   1218 
   1219 	if ((lvp = union_dircache(vp, l)) == NULLVP)
   1220 		return (0);
   1221 
   1222 	error = VOP_OPEN(lvp, FREAD, fp->f_cred);
   1223 	if (error) {
   1224 		vput(lvp);
   1225 		return (error);
   1226 	}
   1227 	VOP_UNLOCK(lvp);
   1228 	fp->f_vnode = lvp;
   1229 	fp->f_offset = 0;
   1230 	error = vn_close(vp, FREAD, fp->f_cred);
   1231 	if (error)
   1232 		return (error);
   1233 	*vpp = lvp;
   1234 	return (0);
   1235 }
   1236