ffs_alloc.c revision 1.114
      1 /*	$NetBSD: ffs_alloc.c,v 1.114 2008/11/06 22:31:08 joerg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Wasabi Systems, Inc.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 2002 Networks Associates Technology, Inc.
     34  * All rights reserved.
     35  *
     36  * This software was developed for the FreeBSD Project by Marshall
     37  * Kirk McKusick and Network Associates Laboratories, the Security
     38  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
     39  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
     40  * research program
     41  *
     42  * Copyright (c) 1982, 1986, 1989, 1993
     43  *	The Regents of the University of California.  All rights reserved.
     44  *
     45  * Redistribution and use in source and binary forms, with or without
     46  * modification, are permitted provided that the following conditions
     47  * are met:
     48  * 1. Redistributions of source code must retain the above copyright
     49  *    notice, this list of conditions and the following disclaimer.
     50  * 2. Redistributions in binary form must reproduce the above copyright
     51  *    notice, this list of conditions and the following disclaimer in the
     52  *    documentation and/or other materials provided with the distribution.
     53  * 3. Neither the name of the University nor the names of its contributors
     54  *    may be used to endorse or promote products derived from this software
     55  *    without specific prior written permission.
     56  *
     57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     67  * SUCH DAMAGE.
     68  *
     69  *	@(#)ffs_alloc.c	8.19 (Berkeley) 7/13/95
     70  */
     71 
     72 #include <sys/cdefs.h>
     73 __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.114 2008/11/06 22:31:08 joerg Exp $");
     74 
     75 #if defined(_KERNEL_OPT)
     76 #include "opt_ffs.h"
     77 #include "opt_quota.h"
     78 #endif
     79 
     80 #include <sys/param.h>
     81 #include <sys/systm.h>
     82 #include <sys/buf.h>
     83 #include <sys/fstrans.h>
     84 #include <sys/kauth.h>
     85 #include <sys/kernel.h>
     86 #include <sys/mount.h>
     87 #include <sys/proc.h>
     88 #include <sys/syslog.h>
     89 #include <sys/vnode.h>
     90 #include <sys/wapbl.h>
     91 
     92 #include <miscfs/specfs/specdev.h>
     93 #include <ufs/ufs/quota.h>
     94 #include <ufs/ufs/ufsmount.h>
     95 #include <ufs/ufs/inode.h>
     96 #include <ufs/ufs/ufs_extern.h>
     97 #include <ufs/ufs/ufs_bswap.h>
     98 #include <ufs/ufs/ufs_wapbl.h>
     99 
    100 #include <ufs/ffs/fs.h>
    101 #include <ufs/ffs/ffs_extern.h>
    102 
    103 static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int);
    104 static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
    105 static ino_t ffs_dirpref(struct inode *);
    106 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
    107 static void ffs_fserr(struct fs *, u_int, const char *);
    108 static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
    109     daddr_t (*)(struct inode *, int, daddr_t, int, int));
    110 static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
    111 static int32_t ffs_mapsearch(struct fs *, struct cg *,
    112 				      daddr_t, int);
    113 
     114 /* if 1, changes in optimization strategy are logged */
    115 int ffs_log_changeopt = 0;
    116 
    117 /* in ffs_tables.c */
    118 extern const int inside[], around[];
    119 extern const u_char * const fragtbl[];
    120 
    121 /*
    122  * Allocate a block in the file system.
    123  *
    124  * The size of the requested block is given, which must be some
    125  * multiple of fs_fsize and <= fs_bsize.
    126  * A preference may be optionally specified. If a preference is given
    127  * the following hierarchy is used to allocate a block:
    128  *   1) allocate the requested block.
    129  *   2) allocate a rotationally optimal block in the same cylinder.
    130  *   3) allocate a block in the same cylinder group.
     131  *   4) quadratically rehash into other cylinder groups, until an
    132  *      available block is located.
    133  * If no block preference is given the following hierarchy is used
    134  * to allocate a block:
    135  *   1) allocate a block in the cylinder group that contains the
    136  *      inode for the file.
     137  *   2) quadratically rehash into other cylinder groups, until an
    138  *      available block is located.
    139  *
    140  * => called with um_lock held
    141  * => releases um_lock before returning
    142  */
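         /*
          * Typical call (a sketch only, with illustrative variable names):
          * callers pass a block preference obtained from ffs_blkpref_ufs1()
          * or ffs_blkpref_ufs2() below, e.g.
          *
          *	error = ffs_alloc(ip, lbn,
          *	    ffs_blkpref_ufs1(ip, lbn, indx, flags, &bap[0]),
          *	    size, flags, cred, &newb);
          */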
    143 int
    144 ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags,
    145     kauth_cred_t cred, daddr_t *bnp)
    146 {
    147 	struct ufsmount *ump;
    148 	struct fs *fs;
    149 	daddr_t bno;
    150 	int cg;
    151 #ifdef QUOTA
    152 	int error;
    153 #endif
    154 
    155 	fs = ip->i_fs;
    156 	ump = ip->i_ump;
    157 
    158 	KASSERT(mutex_owned(&ump->um_lock));
    159 
    160 #ifdef UVM_PAGE_TRKOWN
    161 	if (ITOV(ip)->v_type == VREG &&
    162 	    lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) {
    163 		struct vm_page *pg;
    164 		struct uvm_object *uobj = &ITOV(ip)->v_uobj;
    165 		voff_t off = trunc_page(lblktosize(fs, lbn));
    166 		voff_t endoff = round_page(lblktosize(fs, lbn) + size);
    167 
    168 		mutex_enter(&uobj->vmobjlock);
    169 		while (off < endoff) {
    170 			pg = uvm_pagelookup(uobj, off);
    171 			KASSERT(pg != NULL);
    172 			KASSERT(pg->owner == curproc->p_pid);
    173 			off += PAGE_SIZE;
    174 		}
    175 		mutex_exit(&uobj->vmobjlock);
    176 	}
    177 #endif
    178 
    179 	*bnp = 0;
    180 #ifdef DIAGNOSTIC
    181 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
    182 		printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
    183 		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
    184 		panic("ffs_alloc: bad size");
    185 	}
    186 	if (cred == NOCRED)
    187 		panic("ffs_alloc: missing credential");
    188 #endif /* DIAGNOSTIC */
    189 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
    190 		goto nospace;
    191 	if (freespace(fs, fs->fs_minfree) <= 0 &&
    192 	    kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) != 0)
    193 		goto nospace;
    194 #ifdef QUOTA
    195 	mutex_exit(&ump->um_lock);
    196 	if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
    197 		return (error);
    198 	mutex_enter(&ump->um_lock);
    199 #endif
    200 
    201 	if (bpref >= fs->fs_size)
    202 		bpref = 0;
    203 	if (bpref == 0)
    204 		cg = ino_to_cg(fs, ip->i_number);
    205 	else
    206 		cg = dtog(fs, bpref);
    207 	bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg);
    208 	if (bno > 0) {
    209 		DIP_ADD(ip, blocks, btodb(size));
    210 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    211 		*bnp = bno;
    212 		return (0);
    213 	}
    214 #ifdef QUOTA
    215 	/*
    216 	 * Restore user's disk quota because allocation failed.
    217 	 */
    218 	(void) chkdq(ip, -btodb(size), cred, FORCE);
    219 #endif
    220 	if (flags & B_CONTIG) {
    221 		/*
    222 		 * XXX ump->um_lock handling is "suspect" at best.
    223 		 * For the case where ffs_hashalloc() fails early
    224 		 * in the B_CONTIG case we reach here with um_lock
    225 		 * already unlocked, so we can't release it again
    226 		 * like in the normal error path.  See kern/39206.
    227 		 *
    228 		 *
    229 		 * Fail silently - it's up to our caller to report
    230 		 * errors.
    231 		 */
    232 		return (ENOSPC);
    233 	}
    234 nospace:
    235 	mutex_exit(&ump->um_lock);
    236 	ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
    237 	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
    238 	return (ENOSPC);
    239 }
    240 
    241 /*
    242  * Reallocate a fragment to a bigger size
    243  *
    244  * The number and size of the old block is given, and a preference
    245  * and new size is also specified. The allocator attempts to extend
    246  * the original block. Failing that, the regular block allocator is
    247  * invoked to get an appropriate block.
    248  *
    249  * => called with um_lock held
    250  * => return with um_lock released
    251  */
    252 int
    253 ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
    254     int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop)
    255 {
    256 	struct ufsmount *ump;
    257 	struct fs *fs;
    258 	struct buf *bp;
    259 	int cg, request, error;
    260 	daddr_t bprev, bno;
    261 
    262 	fs = ip->i_fs;
    263 	ump = ip->i_ump;
    264 
    265 	KASSERT(mutex_owned(&ump->um_lock));
    266 
    267 #ifdef UVM_PAGE_TRKOWN
    268 	if (ITOV(ip)->v_type == VREG) {
    269 		struct vm_page *pg;
    270 		struct uvm_object *uobj = &ITOV(ip)->v_uobj;
    271 		voff_t off = trunc_page(lblktosize(fs, lbprev));
    272 		voff_t endoff = round_page(lblktosize(fs, lbprev) + osize);
    273 
    274 		mutex_enter(&uobj->vmobjlock);
    275 		while (off < endoff) {
    276 			pg = uvm_pagelookup(uobj, off);
    277 			KASSERT(pg != NULL);
    278 			KASSERT(pg->owner == curproc->p_pid);
    279 			KASSERT((pg->flags & PG_CLEAN) == 0);
    280 			off += PAGE_SIZE;
    281 		}
    282 		mutex_exit(&uobj->vmobjlock);
    283 	}
    284 #endif
    285 
    286 #ifdef DIAGNOSTIC
    287 	if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
    288 	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
    289 		printf(
    290 		    "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
    291 		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
    292 		panic("ffs_realloccg: bad size");
    293 	}
    294 	if (cred == NOCRED)
    295 		panic("ffs_realloccg: missing credential");
    296 #endif /* DIAGNOSTIC */
    297 	if (freespace(fs, fs->fs_minfree) <= 0 &&
    298 	    kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) != 0) {
    299 		mutex_exit(&ump->um_lock);
    300 		goto nospace;
    301 	}
    302 	if (fs->fs_magic == FS_UFS2_MAGIC)
    303 		bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs));
    304 	else
    305 		bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs));
    306 
    307 	if (bprev == 0) {
    308 		printf("dev = 0x%x, bsize = %d, bprev = %" PRId64 ", fs = %s\n",
    309 		    ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt);
    310 		panic("ffs_realloccg: bad bprev");
    311 	}
    312 	mutex_exit(&ump->um_lock);
    313 
    314 	/*
    315 	 * Allocate the extra space in the buffer.
    316 	 */
    317 	if (bpp != NULL &&
    318 	    (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) {
    319 		brelse(bp, 0);
    320 		return (error);
    321 	}
    322 #ifdef QUOTA
    323 	if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
    324 		if (bpp != NULL) {
    325 			brelse(bp, 0);
    326 		}
    327 		return (error);
    328 	}
    329 #endif
    330 	/*
    331 	 * Check for extension in the existing location.
    332 	 */
    333 	cg = dtog(fs, bprev);
    334 	mutex_enter(&ump->um_lock);
    335 	if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
    336 		DIP_ADD(ip, blocks, btodb(nsize - osize));
    337 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    338 
    339 		if (bpp != NULL) {
    340 			if (bp->b_blkno != fsbtodb(fs, bno))
    341 				panic("bad blockno");
    342 			allocbuf(bp, nsize, 1);
    343 			memset((char *)bp->b_data + osize, 0, nsize - osize);
    344 			mutex_enter(bp->b_objlock);
    345 			KASSERT(!cv_has_waiters(&bp->b_done));
    346 			bp->b_oflags |= BO_DONE;
    347 			mutex_exit(bp->b_objlock);
    348 			*bpp = bp;
    349 		}
    350 		if (blknop != NULL) {
    351 			*blknop = bno;
    352 		}
    353 		return (0);
    354 	}
    355 	/*
    356 	 * Allocate a new disk location.
    357 	 */
    358 	if (bpref >= fs->fs_size)
    359 		bpref = 0;
    360 	switch ((int)fs->fs_optim) {
    361 	case FS_OPTSPACE:
    362 		/*
    363 		 * Allocate an exact sized fragment. Although this makes
    364 		 * best use of space, we will waste time relocating it if
    365 		 * the file continues to grow. If the fragmentation is
    366 		 * less than half of the minimum free reserve, we choose
    367 		 * to begin optimizing for time.
    368 		 */
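         		/*
         		 * For example, with fs_minfree at 5% the switch below
         		 * to FS_OPTTIME happens once free fragments amount to
         		 * no more than 2.5% of fs_dsize.
         		 */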
    369 		request = nsize;
    370 		if (fs->fs_minfree < 5 ||
    371 		    fs->fs_cstotal.cs_nffree >
    372 		    fs->fs_dsize * fs->fs_minfree / (2 * 100))
    373 			break;
    374 
    375 		if (ffs_log_changeopt) {
    376 			log(LOG_NOTICE,
    377 				"%s: optimization changed from SPACE to TIME\n",
    378 				fs->fs_fsmnt);
    379 		}
    380 
    381 		fs->fs_optim = FS_OPTTIME;
    382 		break;
    383 	case FS_OPTTIME:
    384 		/*
    385 		 * At this point we have discovered a file that is trying to
    386 		 * grow a small fragment to a larger fragment. To save time,
    387 		 * we allocate a full sized block, then free the unused portion.
    388 		 * If the file continues to grow, the `ffs_fragextend' call
    389 		 * above will be able to grow it in place without further
    390 		 * copying. If aberrant programs cause disk fragmentation to
    391 		 * grow within 2% of the free reserve, we choose to begin
    392 		 * optimizing for space.
    393 		 */
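         		/*
         		 * For example, with fs_minfree at 5% the switch back to
         		 * FS_OPTSPACE below happens once free fragments reach
         		 * 3% of fs_dsize, i.e. within 2% of the reserve.
         		 */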
    394 		request = fs->fs_bsize;
    395 		if (fs->fs_cstotal.cs_nffree <
    396 		    fs->fs_dsize * (fs->fs_minfree - 2) / 100)
    397 			break;
    398 
    399 		if (ffs_log_changeopt) {
    400 			log(LOG_NOTICE,
    401 				"%s: optimization changed from TIME to SPACE\n",
    402 				fs->fs_fsmnt);
    403 		}
    404 
    405 		fs->fs_optim = FS_OPTSPACE;
    406 		break;
    407 	default:
    408 		printf("dev = 0x%x, optim = %d, fs = %s\n",
    409 		    ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
    410 		panic("ffs_realloccg: bad optim");
    411 		/* NOTREACHED */
    412 	}
    413 	bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg);
    414 	if (bno > 0) {
    415 		if (!DOINGSOFTDEP(ITOV(ip))) {
    416 			if ((ip->i_ump->um_mountp->mnt_wapbl) &&
    417 			    (ITOV(ip)->v_type != VREG)) {
    418 				UFS_WAPBL_REGISTER_DEALLOCATION(
    419 				    ip->i_ump->um_mountp, fsbtodb(fs, bprev),
    420 				    osize);
    421 			} else
    422 				ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
    423 				    ip->i_number);
    424 		}
    425 		if (nsize < request) {
    426 			if ((ip->i_ump->um_mountp->mnt_wapbl) &&
    427 			    (ITOV(ip)->v_type != VREG)) {
    428 				UFS_WAPBL_REGISTER_DEALLOCATION(
    429 				    ip->i_ump->um_mountp,
    430 				    fsbtodb(fs, (bno + numfrags(fs, nsize))),
    431 				    request - nsize);
    432 			} else
    433 				ffs_blkfree(fs, ip->i_devvp,
    434 				    bno + numfrags(fs, nsize),
    435 				    (long)(request - nsize), ip->i_number);
    436 		}
    437 		DIP_ADD(ip, blocks, btodb(nsize - osize));
    438 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    439 		if (bpp != NULL) {
    440 			bp->b_blkno = fsbtodb(fs, bno);
    441 			allocbuf(bp, nsize, 1);
    442 			memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
    443 			mutex_enter(bp->b_objlock);
    444 			KASSERT(!cv_has_waiters(&bp->b_done));
    445 			bp->b_oflags |= BO_DONE;
    446 			mutex_exit(bp->b_objlock);
    447 			*bpp = bp;
    448 		}
    449 		if (blknop != NULL) {
    450 			*blknop = bno;
    451 		}
    452 		return (0);
    453 	}
    454 	mutex_exit(&ump->um_lock);
    455 
    456 #ifdef QUOTA
    457 	/*
    458 	 * Restore user's disk quota because allocation failed.
    459 	 */
    460 	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
    461 #endif
    462 	if (bpp != NULL) {
    463 		brelse(bp, 0);
    464 	}
    465 
    466 nospace:
    467 	/*
    468 	 * no space available
    469 	 */
    470 	ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
    471 	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
    472 	return (ENOSPC);
    473 }
    474 
    475 /*
    476  * Allocate an inode in the file system.
    477  *
    478  * If allocating a directory, use ffs_dirpref to select the inode.
    479  * If allocating in a directory, the following hierarchy is followed:
    480  *   1) allocate the preferred inode.
    481  *   2) allocate an inode in the same cylinder group.
     482  *   3) quadratically rehash into other cylinder groups, until an
    483  *      available inode is located.
    484  * If no inode preference is given the following hierarchy is used
    485  * to allocate an inode:
    486  *   1) allocate an inode in cylinder group 0.
     487  *   2) quadratically rehash into other cylinder groups, until an
    488  *      available inode is located.
    489  *
    490  * => um_lock not held upon entry or return
    491  */
    492 int
    493 ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
    494     struct vnode **vpp)
    495 {
    496 	struct ufsmount *ump;
    497 	struct inode *pip;
    498 	struct fs *fs;
    499 	struct inode *ip;
    500 	struct timespec ts;
    501 	ino_t ino, ipref;
    502 	int cg, error;
    503 
    504 	UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
    505 
    506 	*vpp = NULL;
    507 	pip = VTOI(pvp);
    508 	fs = pip->i_fs;
    509 	ump = pip->i_ump;
    510 
    511 	error = UFS_WAPBL_BEGIN(pvp->v_mount);
    512 	if (error) {
    513 		return error;
    514 	}
    515 	mutex_enter(&ump->um_lock);
    516 	if (fs->fs_cstotal.cs_nifree == 0)
    517 		goto noinodes;
    518 
    519 	if ((mode & IFMT) == IFDIR)
    520 		ipref = ffs_dirpref(pip);
    521 	else
    522 		ipref = pip->i_number;
    523 	if (ipref >= fs->fs_ncg * fs->fs_ipg)
    524 		ipref = 0;
    525 	cg = ino_to_cg(fs, ipref);
    526 	/*
     527 	 * Track the number of dirs created one after another
     528 	 * in the same cg without intervening allocation of files.
    529 	 */
    530 	if ((mode & IFMT) == IFDIR) {
    531 		if (fs->fs_contigdirs[cg] < 255)
    532 			fs->fs_contigdirs[cg]++;
    533 	} else {
    534 		if (fs->fs_contigdirs[cg] > 0)
    535 			fs->fs_contigdirs[cg]--;
    536 	}
    537 	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg);
    538 	if (ino == 0)
    539 		goto noinodes;
    540 	UFS_WAPBL_END(pvp->v_mount);
    541 	error = VFS_VGET(pvp->v_mount, ino, vpp);
    542 	if (error) {
    543 		int err;
    544 		err = UFS_WAPBL_BEGIN(pvp->v_mount);
    545 		if (err == 0)
    546 			ffs_vfree(pvp, ino, mode);
    547 		if (err == 0)
    548 			UFS_WAPBL_END(pvp->v_mount);
    549 		return (error);
    550 	}
    551 	KASSERT((*vpp)->v_type == VNON);
    552 	ip = VTOI(*vpp);
    553 	if (ip->i_mode) {
    554 #if 0
    555 		printf("mode = 0%o, inum = %d, fs = %s\n",
    556 		    ip->i_mode, ip->i_number, fs->fs_fsmnt);
    557 #else
    558 		printf("dmode %x mode %x dgen %x gen %x\n",
    559 		    DIP(ip, mode), ip->i_mode,
    560 		    DIP(ip, gen), ip->i_gen);
    561 		printf("size %llx blocks %llx\n",
    562 		    (long long)DIP(ip, size), (long long)DIP(ip, blocks));
    563 		printf("ino %llu ipref %llu\n", (unsigned long long)ino,
    564 		    (unsigned long long)ipref);
    565 #if 0
    566 		error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
    567 		    (int)fs->fs_bsize, NOCRED, 0, &bp);
    568 #endif
    569 
    570 #endif
    571 		panic("ffs_valloc: dup alloc");
    572 	}
    573 	if (DIP(ip, blocks)) {				/* XXX */
    574 		printf("free inode %s/%llu had %" PRId64 " blocks\n",
    575 		    fs->fs_fsmnt, (unsigned long long)ino, DIP(ip, blocks));
    576 		DIP_ASSIGN(ip, blocks, 0);
    577 	}
    578 	ip->i_flag &= ~IN_SPACECOUNTED;
    579 	ip->i_flags = 0;
    580 	DIP_ASSIGN(ip, flags, 0);
    581 	/*
    582 	 * Set up a new generation number for this inode.
    583 	 */
    584 	ip->i_gen++;
    585 	DIP_ASSIGN(ip, gen, ip->i_gen);
    586 	if (fs->fs_magic == FS_UFS2_MAGIC) {
    587 		vfs_timestamp(&ts);
    588 		ip->i_ffs2_birthtime = ts.tv_sec;
    589 		ip->i_ffs2_birthnsec = ts.tv_nsec;
    590 	}
    591 	return (0);
    592 noinodes:
    593 	mutex_exit(&ump->um_lock);
    594 	UFS_WAPBL_END(pvp->v_mount);
    595 	ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
    596 	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
    597 	return (ENOSPC);
    598 }
    599 
    600 /*
    601  * Find a cylinder group in which to place a directory.
    602  *
    603  * The policy implemented by this algorithm is to allocate a
    604  * directory inode in the same cylinder group as its parent
     605  * directory, but also to reserve space for its file inodes
    606  * and data. Restrict the number of directories which may be
    607  * allocated one after another in the same cylinder group
    608  * without intervening allocation of files.
    609  *
    610  * If we allocate a first level directory then force allocation
    611  * in another cylinder group.
    612  */
    613 static ino_t
    614 ffs_dirpref(struct inode *pip)
    615 {
    616 	register struct fs *fs;
    617 	int cg, prefcg;
    618 	int64_t dirsize, cgsize, curdsz;
    619 	int avgifree, avgbfree, avgndir;
    620 	int minifree, minbfree, maxndir;
    621 	int mincg, minndir;
    622 	int maxcontigdirs;
    623 
    624 	KASSERT(mutex_owned(&pip->i_ump->um_lock));
    625 
    626 	fs = pip->i_fs;
    627 
    628 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
    629 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
    630 	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
    631 
    632 	/*
    633 	 * Force allocation in another cg if creating a first level dir.
    634 	 */
    635 	if (ITOV(pip)->v_vflag & VV_ROOT) {
    636 		prefcg = random() % fs->fs_ncg;
    637 		mincg = prefcg;
    638 		minndir = fs->fs_ipg;
    639 		for (cg = prefcg; cg < fs->fs_ncg; cg++)
    640 			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
    641 			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
    642 			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    643 				mincg = cg;
    644 				minndir = fs->fs_cs(fs, cg).cs_ndir;
    645 			}
    646 		for (cg = 0; cg < prefcg; cg++)
    647 			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
    648 			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
    649 			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    650 				mincg = cg;
    651 				minndir = fs->fs_cs(fs, cg).cs_ndir;
    652 			}
    653 		return ((ino_t)(fs->fs_ipg * mincg));
    654 	}
    655 
    656 	/*
     657 	 * Compute various limits that are used for
     658 	 * optimal allocation of a directory inode.
    659 	 */
    660 	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
    661 	minifree = avgifree - fs->fs_ipg / 4;
    662 	if (minifree < 0)
    663 		minifree = 0;
    664 	minbfree = avgbfree - fragstoblks(fs, fs->fs_fpg) / 4;
    665 	if (minbfree < 0)
    666 		minbfree = 0;
    667 	cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
    668 	dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
    669 	if (avgndir != 0) {
    670 		curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
    671 		if (dirsize < curdsz)
    672 			dirsize = curdsz;
    673 	}
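         	/*
         	 * dirsize estimates how much space a new directory and its
         	 * files will eventually consume: the per-fs expectation
         	 * (fs_avgfilesize * fs_avgfpdir) or, if larger, the space
         	 * already in use per existing directory.  maxcontigdirs is
         	 * then the number of such directories that fit in one
         	 * cylinder group, capped at 255 (the range of the
         	 * fs_contigdirs[] counters) and at fs_ipg / fs_avgfpdir.
         	 */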
    674 	if (cgsize < dirsize * 255)
    675 		maxcontigdirs = cgsize / dirsize;
    676 	else
    677 		maxcontigdirs = 255;
    678 	if (fs->fs_avgfpdir > 0)
    679 		maxcontigdirs = min(maxcontigdirs,
    680 				    fs->fs_ipg / fs->fs_avgfpdir);
    681 	if (maxcontigdirs == 0)
    682 		maxcontigdirs = 1;
    683 
    684 	/*
    685 	 * Limit number of dirs in one cg and reserve space for
    686 	 * regular files, but only if we have no deficit in
    687 	 * inodes or space.
    688 	 */
    689 	prefcg = ino_to_cg(fs, pip->i_number);
    690 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
    691 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
    692 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
    693 	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
    694 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
    695 				return ((ino_t)(fs->fs_ipg * cg));
    696 		}
    697 	for (cg = 0; cg < prefcg; cg++)
    698 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
    699 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
    700 	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
    701 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
    702 				return ((ino_t)(fs->fs_ipg * cg));
    703 		}
    704 	/*
    705 	 * This is a backstop when we are deficient in space.
    706 	 */
    707 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
    708 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
    709 			return ((ino_t)(fs->fs_ipg * cg));
    710 	for (cg = 0; cg < prefcg; cg++)
    711 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
    712 			break;
    713 	return ((ino_t)(fs->fs_ipg * cg));
    714 }
    715 
    716 /*
    717  * Select the desired position for the next block in a file.  The file is
    718  * logically divided into sections. The first section is composed of the
    719  * direct blocks. Each additional section contains fs_maxbpg blocks.
    720  *
    721  * If no blocks have been allocated in the first section, the policy is to
    722  * request a block in the same cylinder group as the inode that describes
    723  * the file. If no blocks have been allocated in any other section, the
    724  * policy is to place the section in a cylinder group with a greater than
    725  * average number of free blocks.  An appropriate cylinder group is found
    726  * by using a rotor that sweeps the cylinder groups. When a new group of
    727  * blocks is needed, the sweep begins in the cylinder group following the
    728  * cylinder group from which the previous allocation was made. The sweep
    729  * continues until a cylinder group with greater than the average number
    730  * of free blocks is found. If the allocation is for the first block in an
    731  * indirect block, the information on the previous allocation is unavailable;
    732  * here a best guess is made based upon the logical block number being
    733  * allocated.
    734  *
    735  * If a section is already partially allocated, the policy is to
    736  * contiguously allocate fs_maxcontig blocks.  The end of one of these
    737  * contiguous blocks and the beginning of the next is laid out
     738  * contiguously if possible.
    739  *
    740  * => um_lock held on entry and exit
    741  */
    742 daddr_t
    743 ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
    744     int32_t *bap /* XXX ondisk32 */)
    745 {
    746 	struct fs *fs;
    747 	int cg;
    748 	int avgbfree, startcg;
    749 
    750 	KASSERT(mutex_owned(&ip->i_ump->um_lock));
    751 
    752 	fs = ip->i_fs;
    753 
    754 	/*
    755 	 * If allocating a contiguous file with B_CONTIG, use the hints
     756 	 * in the inode extensions to return the desired block.
    757 	 *
    758 	 * For metadata (indirect blocks) return the address of where
    759 	 * the first indirect block resides - we'll scan for the next
    760 	 * available slot if we need to allocate more than one indirect
    761 	 * block.  For data, return the address of the actual block
    762 	 * relative to the address of the first data block.
    763 	 */
    764 	if (flags & B_CONTIG) {
    765 		KASSERT(ip->i_ffs_first_data_blk != 0);
    766 		KASSERT(ip->i_ffs_first_indir_blk != 0);
    767 		if (flags & B_METAONLY)
    768 			return ip->i_ffs_first_indir_blk;
    769 		else
    770 			return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
    771 	}
    772 
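         	/*
         	 * bap[] holds the block addresses already allocated in the
         	 * current section (the direct blocks or the current indirect
         	 * block) and indx is the slot now being filled, so
         	 * bap[indx - 1] is the file's previous block when one exists.
         	 */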
    773 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
    774 		if (lbn < NDADDR + NINDIR(fs)) {
    775 			cg = ino_to_cg(fs, ip->i_number);
    776 			return (cgbase(fs, cg) + fs->fs_frag);
    777 		}
    778 		/*
     779 		 * Find a cylinder group with a greater than average
     780 		 * number of unused data blocks.
    781 		 */
    782 		if (indx == 0 || bap[indx - 1] == 0)
    783 			startcg =
    784 			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
    785 		else
    786 			startcg = dtog(fs,
    787 				ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
    788 		startcg %= fs->fs_ncg;
    789 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
    790 		for (cg = startcg; cg < fs->fs_ncg; cg++)
    791 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    792 				return (cgbase(fs, cg) + fs->fs_frag);
    793 			}
    794 		for (cg = 0; cg < startcg; cg++)
    795 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    796 				return (cgbase(fs, cg) + fs->fs_frag);
    797 			}
    798 		return (0);
    799 	}
    800 	/*
    801 	 * We just always try to lay things out contiguously.
    802 	 */
    803 	return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
    804 }
    805 
    806 daddr_t
    807 ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
    808     int64_t *bap)
    809 {
    810 	struct fs *fs;
    811 	int cg;
    812 	int avgbfree, startcg;
    813 
    814 	KASSERT(mutex_owned(&ip->i_ump->um_lock));
    815 
    816 	fs = ip->i_fs;
    817 
    818 	/*
    819 	 * If allocating a contiguous file with B_CONTIG, use the hints
     820 	 * in the inode extensions to return the desired block.
    821 	 *
    822 	 * For metadata (indirect blocks) return the address of where
    823 	 * the first indirect block resides - we'll scan for the next
    824 	 * available slot if we need to allocate more than one indirect
    825 	 * block.  For data, return the address of the actual block
    826 	 * relative to the address of the first data block.
    827 	 */
    828 	if (flags & B_CONTIG) {
    829 		KASSERT(ip->i_ffs_first_data_blk != 0);
    830 		KASSERT(ip->i_ffs_first_indir_blk != 0);
    831 		if (flags & B_METAONLY)
    832 			return ip->i_ffs_first_indir_blk;
    833 		else
    834 			return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
    835 	}
    836 
    837 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
    838 		if (lbn < NDADDR + NINDIR(fs)) {
    839 			cg = ino_to_cg(fs, ip->i_number);
    840 			return (cgbase(fs, cg) + fs->fs_frag);
    841 		}
    842 		/*
     843 		 * Find a cylinder group with a greater than average
     844 		 * number of unused data blocks.
    845 		 */
    846 		if (indx == 0 || bap[indx - 1] == 0)
    847 			startcg =
    848 			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
    849 		else
    850 			startcg = dtog(fs,
    851 				ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
    852 		startcg %= fs->fs_ncg;
    853 		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
    854 		for (cg = startcg; cg < fs->fs_ncg; cg++)
    855 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    856 				return (cgbase(fs, cg) + fs->fs_frag);
    857 			}
    858 		for (cg = 0; cg < startcg; cg++)
    859 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
    860 				return (cgbase(fs, cg) + fs->fs_frag);
    861 			}
    862 		return (0);
    863 	}
    864 	/*
    865 	 * We just always try to lay things out contiguously.
    866 	 */
    867 	return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
    868 }
    869 
    870 
    871 /*
    872  * Implement the cylinder overflow algorithm.
    873  *
    874  * The policy implemented by this algorithm is:
    875  *   1) allocate the block in its requested cylinder group.
     876  *   2) quadratically rehash on the cylinder group number.
    877  *   3) brute force search for a free block.
    878  *
    879  * => called with um_lock held
    880  * => returns with um_lock released on success, held on failure
    881  *    (*allocator releases lock on success, retains lock on failure)
    882  */
    883 /*VARARGS5*/
    884 static daddr_t
    885 ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
    886     int size /* size for data blocks, mode for inodes */,
    887     int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int))
    888 {
    889 	struct fs *fs;
    890 	daddr_t result;
    891 	int i, icg = cg;
    892 
    893 	fs = ip->i_fs;
    894 	/*
    895 	 * 1: preferred cylinder group
    896 	 */
    897 	result = (*allocator)(ip, cg, pref, size, flags);
    898 	if (result)
    899 		return (result);
    900 
    901 	if (flags & B_CONTIG)
    902 		return (result);
    903 	/*
    904 	 * 2: quadratic rehash
    905 	 */
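         	/*
         	 * The groups probed are at offsets 1, 3, 7, 15, ... (2^n - 1)
         	 * from the starting group, wrapping modulo fs_ncg.
         	 */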
    906 	for (i = 1; i < fs->fs_ncg; i *= 2) {
    907 		cg += i;
    908 		if (cg >= fs->fs_ncg)
    909 			cg -= fs->fs_ncg;
    910 		result = (*allocator)(ip, cg, 0, size, flags);
    911 		if (result)
    912 			return (result);
    913 	}
    914 	/*
    915 	 * 3: brute force search
    916 	 * Note that we start at i == 2, since 0 was checked initially,
    917 	 * and 1 is always checked in the quadratic rehash.
    918 	 */
    919 	cg = (icg + 2) % fs->fs_ncg;
    920 	for (i = 2; i < fs->fs_ncg; i++) {
    921 		result = (*allocator)(ip, cg, 0, size, flags);
    922 		if (result)
    923 			return (result);
    924 		cg++;
    925 		if (cg == fs->fs_ncg)
    926 			cg = 0;
    927 	}
    928 	return (0);
    929 }
    930 
    931 /*
    932  * Determine whether a fragment can be extended.
    933  *
    934  * Check to see if the necessary fragments are available, and
    935  * if they are, allocate them.
    936  *
    937  * => called with um_lock held
    938  * => returns with um_lock released on success, held on failure
    939  */
    940 static daddr_t
    941 ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize)
    942 {
    943 	struct ufsmount *ump;
    944 	struct fs *fs;
    945 	struct cg *cgp;
    946 	struct buf *bp;
    947 	daddr_t bno;
    948 	int frags, bbase;
    949 	int i, error;
    950 	u_int8_t *blksfree;
    951 
    952 	fs = ip->i_fs;
    953 	ump = ip->i_ump;
    954 
    955 	KASSERT(mutex_owned(&ump->um_lock));
    956 
    957 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
    958 		return (0);
    959 	frags = numfrags(fs, nsize);
    960 	bbase = fragnum(fs, bprev);
    961 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
    962 		/* cannot extend across a block boundary */
    963 		return (0);
    964 	}
    965 	mutex_exit(&ump->um_lock);
    966 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
    967 		(int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
    968 	if (error)
    969 		goto fail;
    970 	cgp = (struct cg *)bp->b_data;
    971 	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
    972 		goto fail;
    973 	cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
    974 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
    975 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
    976 		cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
    977 	bno = dtogd(fs, bprev);
    978 	blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
    979 	for (i = numfrags(fs, osize); i < frags; i++)
    980 		if (isclr(blksfree, bno + i))
    981 			goto fail;
    982 	/*
    983 	 * the current fragment can be extended
    984 	 * deduct the count on fragment being extended into
    985 	 * increase the count on the remaining fragment (if any)
    986 	 * allocate the extended piece
    987 	 */
    988 	for (i = frags; i < fs->fs_frag - bbase; i++)
    989 		if (isclr(blksfree, bno + i))
    990 			break;
    991 	ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
    992 	if (i != frags)
    993 		ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
    994 	mutex_enter(&ump->um_lock);
    995 	for (i = numfrags(fs, osize); i < frags; i++) {
    996 		clrbit(blksfree, bno + i);
    997 		ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
    998 		fs->fs_cstotal.cs_nffree--;
    999 		fs->fs_cs(fs, cg).cs_nffree--;
   1000 	}
   1001 	fs->fs_fmod = 1;
   1002 	ACTIVECG_CLR(fs, cg);
   1003 	mutex_exit(&ump->um_lock);
   1004 	if (DOINGSOFTDEP(ITOV(ip)))
   1005 		softdep_setup_blkmapdep(bp, fs, bprev);
   1006 	bdwrite(bp);
   1007 	return (bprev);
   1008 
   1009  fail:
   1010  	brelse(bp, 0);
   1011  	mutex_enter(&ump->um_lock);
   1012  	return (0);
   1013 }
   1014 
   1015 /*
   1016  * Determine whether a block can be allocated.
   1017  *
   1018  * Check to see if a block of the appropriate size is available,
   1019  * and if it is, allocate it.
   1020  */
   1021 static daddr_t
   1022 ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags)
   1023 {
   1024 	struct ufsmount *ump;
   1025 	struct fs *fs = ip->i_fs;
   1026 	struct cg *cgp;
   1027 	struct buf *bp;
   1028 	int32_t bno;
   1029 	daddr_t blkno;
   1030 	int error, frags, allocsiz, i;
   1031 	u_int8_t *blksfree;
   1032 #ifdef FFS_EI
   1033 	const int needswap = UFS_FSNEEDSWAP(fs);
   1034 #endif
   1035 
   1036 	ump = ip->i_ump;
   1037 
   1038 	KASSERT(mutex_owned(&ump->um_lock));
   1039 
   1040 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
   1041 		return (0);
   1042 	mutex_exit(&ump->um_lock);
   1043 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
   1044 		(int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
   1045 	if (error)
   1046 		goto fail;
   1047 	cgp = (struct cg *)bp->b_data;
   1048 	if (!cg_chkmagic(cgp, needswap) ||
   1049 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
   1050 		goto fail;
   1051 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
   1052 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
   1053 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
   1054 		cgp->cg_time = ufs_rw64(time_second, needswap);
   1055 	if (size == fs->fs_bsize) {
   1056 		mutex_enter(&ump->um_lock);
   1057 		blkno = ffs_alloccgblk(ip, bp, bpref, flags);
   1058 		ACTIVECG_CLR(fs, cg);
   1059 		mutex_exit(&ump->um_lock);
   1060 		bdwrite(bp);
   1061 		return (blkno);
   1062 	}
   1063 	/*
    1064 	 * check to see if any fragments are already available.
    1065 	 * allocsiz is the size of the smallest free run that will
    1066 	 * satisfy the request; any excess is split off and left free.
   1067 	 */
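         	/*
         	 * cg_frsum[n] counts the free runs of exactly n fragments in
         	 * this cylinder group, so the loop below picks the smallest
         	 * run big enough to hold the request.
         	 */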
   1068 	blksfree = cg_blksfree(cgp, needswap);
   1069 	frags = numfrags(fs, size);
   1070 	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
   1071 		if (cgp->cg_frsum[allocsiz] != 0)
   1072 			break;
   1073 	if (allocsiz == fs->fs_frag) {
   1074 		/*
   1075 		 * no fragments were available, so a block will be
   1076 		 * allocated, and hacked up
   1077 		 */
   1078 		if (cgp->cg_cs.cs_nbfree == 0)
   1079 			goto fail;
   1080 		mutex_enter(&ump->um_lock);
   1081 		blkno = ffs_alloccgblk(ip, bp, bpref, flags);
   1082 		bno = dtogd(fs, blkno);
   1083 		for (i = frags; i < fs->fs_frag; i++)
   1084 			setbit(blksfree, bno + i);
   1085 		i = fs->fs_frag - frags;
   1086 		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
   1087 		fs->fs_cstotal.cs_nffree += i;
   1088 		fs->fs_cs(fs, cg).cs_nffree += i;
   1089 		fs->fs_fmod = 1;
   1090 		ufs_add32(cgp->cg_frsum[i], 1, needswap);
   1091 		ACTIVECG_CLR(fs, cg);
   1092 		mutex_exit(&ump->um_lock);
   1093 		bdwrite(bp);
   1094 		return (blkno);
   1095 	}
   1096 	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
   1097 #if 0
   1098 	/*
   1099 	 * XXX fvdl mapsearch will panic, and never return -1
   1100 	 *          also: returning NULL as daddr_t ?
   1101 	 */
   1102 	if (bno < 0)
   1103 		goto fail;
   1104 #endif
   1105 	for (i = 0; i < frags; i++)
   1106 		clrbit(blksfree, bno + i);
   1107 	mutex_enter(&ump->um_lock);
   1108 	ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
   1109 	fs->fs_cstotal.cs_nffree -= frags;
   1110 	fs->fs_cs(fs, cg).cs_nffree -= frags;
   1111 	fs->fs_fmod = 1;
   1112 	ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
   1113 	if (frags != allocsiz)
   1114 		ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
   1115 	blkno = cg * fs->fs_fpg + bno;
   1116 	ACTIVECG_CLR(fs, cg);
   1117 	mutex_exit(&ump->um_lock);
   1118 	if (DOINGSOFTDEP(ITOV(ip)))
   1119 		softdep_setup_blkmapdep(bp, fs, blkno);
   1120 	bdwrite(bp);
   1121 	return blkno;
   1122 
   1123  fail:
   1124  	brelse(bp, 0);
   1125  	mutex_enter(&ump->um_lock);
   1126  	return (0);
   1127 }
   1128 
   1129 /*
   1130  * Allocate a block in a cylinder group.
   1131  *
   1132  * This algorithm implements the following policy:
   1133  *   1) allocate the requested block.
   1134  *   2) allocate a rotationally optimal block in the same cylinder.
   1135  *   3) allocate the next available block on the block rotor for the
   1136  *      specified cylinder group.
   1137  * Note that this routine only allocates fs_bsize blocks; these
   1138  * blocks may be fragmented by the routine that allocates them.
   1139  */
   1140 static daddr_t
   1141 ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags)
   1142 {
   1143 	struct ufsmount *ump;
   1144 	struct fs *fs = ip->i_fs;
   1145 	struct cg *cgp;
   1146 	daddr_t blkno;
   1147 	int32_t bno;
   1148 	u_int8_t *blksfree;
   1149 #ifdef FFS_EI
   1150 	const int needswap = UFS_FSNEEDSWAP(fs);
   1151 #endif
   1152 
   1153 	ump = ip->i_ump;
   1154 
   1155 	KASSERT(mutex_owned(&ump->um_lock));
   1156 
   1157 	cgp = (struct cg *)bp->b_data;
   1158 	blksfree = cg_blksfree(cgp, needswap);
   1159 	if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
   1160 		bpref = ufs_rw32(cgp->cg_rotor, needswap);
   1161 	} else {
   1162 		bpref = blknum(fs, bpref);
   1163 		bno = dtogd(fs, bpref);
   1164 		/*
   1165 		 * if the requested block is available, use it
   1166 		 */
   1167 		if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
   1168 			goto gotit;
   1169 		/*
   1170 		 * if the requested data block isn't available and we are
   1171 		 * trying to allocate a contiguous file, return an error.
   1172 		 */
   1173 		if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
   1174 			return (0);
   1175 	}
   1176 
   1177 	/*
   1178 	 * Take the next available block in this cylinder group.
   1179 	 */
   1180 	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
   1181 	if (bno < 0)
   1182 		return (0);
   1183 	cgp->cg_rotor = ufs_rw32(bno, needswap);
   1184 gotit:
   1185 	blkno = fragstoblks(fs, bno);
   1186 	ffs_clrblock(fs, blksfree, blkno);
   1187 	ffs_clusteracct(fs, cgp, blkno, -1);
   1188 	ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
   1189 	fs->fs_cstotal.cs_nbfree--;
   1190 	fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
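         	/*
         	 * On old-format UFS1 file systems the per-cylinder rotational
         	 * position tables are still maintained; account for the
         	 * allocated block in them as well.
         	 */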
   1191 	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
   1192 	    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
   1193 		int cylno;
   1194 		cylno = old_cbtocylno(fs, bno);
   1195 		KASSERT(cylno >= 0);
   1196 		KASSERT(cylno < fs->fs_old_ncyl);
   1197 		KASSERT(old_cbtorpos(fs, bno) >= 0);
   1198 		KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
   1199 		ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
   1200 		    needswap);
   1201 		ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
   1202 	}
   1203 	fs->fs_fmod = 1;
   1204 	blkno = ufs_rw32(cgp->cg_cgx, needswap) * fs->fs_fpg + bno;
   1205 	if (DOINGSOFTDEP(ITOV(ip))) {
   1206 		mutex_exit(&ump->um_lock);
   1207 		softdep_setup_blkmapdep(bp, fs, blkno);
   1208 		mutex_enter(&ump->um_lock);
   1209 	}
   1210 	return (blkno);
   1211 }
   1212 
   1213 /*
   1214  * Determine whether an inode can be allocated.
   1215  *
   1216  * Check to see if an inode is available, and if it is,
   1217  * allocate it using the following policy:
   1218  *   1) allocate the requested inode.
   1219  *   2) allocate the next available inode after the requested
   1220  *      inode in the specified cylinder group.
   1221  */
   1222 static daddr_t
   1223 ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags)
   1224 {
   1225 	struct ufsmount *ump = ip->i_ump;
   1226 	struct fs *fs = ip->i_fs;
   1227 	struct cg *cgp;
   1228 	struct buf *bp, *ibp;
   1229 	u_int8_t *inosused;
   1230 	int error, start, len, loc, map, i;
   1231 	int32_t initediblk;
   1232 	daddr_t nalloc;
   1233 	struct ufs2_dinode *dp2;
   1234 #ifdef FFS_EI
   1235 	const int needswap = UFS_FSNEEDSWAP(fs);
   1236 #endif
   1237 
   1238 	KASSERT(mutex_owned(&ump->um_lock));
   1239 	UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
   1240 
   1241 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
   1242 		return (0);
   1243 	mutex_exit(&ump->um_lock);
   1244 	ibp = NULL;
   1245 	initediblk = -1;
   1246 retry:
   1247 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
   1248 		(int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
   1249 	if (error)
   1250 		goto fail;
   1251 	cgp = (struct cg *)bp->b_data;
   1252 	if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
   1253 		goto fail;
   1254 
   1255 	if (ibp != NULL &&
   1256 	    initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
   1257 		/* Another thread allocated more inodes so we retry the test. */
   1258 		brelse(ibp, BC_INVAL);
   1259 		ibp = NULL;
   1260 	}
   1261 	/*
   1262 	 * Check to see if we need to initialize more inodes.
   1263 	 */
   1264 	if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
   1265 		initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
   1266 		nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
   1267 		if (nalloc + INOPB(fs) > initediblk &&
   1268 		    initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
   1269 			/*
    1270 			 * We have to release the cg buffer here because
    1271 			 * reading the inode block may trigger a copy-on-write
    1272 			 * that uses this cg, which would deadlock.
   1273 			 */
   1274 			brelse(bp, 0);
   1275 			bp = NULL;
   1276 			error = ffs_getblk(ip->i_devvp, fsbtodb(fs,
   1277 			    ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
   1278 			    FFS_NOBLK, fs->fs_bsize, false, &ibp);
   1279 			if (error)
   1280 				goto fail;
   1281 			goto retry;
   1282 		}
   1283 	}
   1284 
   1285 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
   1286 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
   1287 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
   1288 		cgp->cg_time = ufs_rw64(time_second, needswap);
   1289 	inosused = cg_inosused(cgp, needswap);
   1290 	if (ipref) {
   1291 		ipref %= fs->fs_ipg;
   1292 		if (isclr(inosused, ipref))
   1293 			goto gotit;
   1294 	}
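         	/*
         	 * Scan the in-use map from the inode rotor for a byte with a
         	 * free bit; skpc() skips bytes that are entirely allocated
         	 * (0xff).  If nothing is found, wrap and scan from the start.
         	 */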
   1295 	start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
   1296 	len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
   1297 		NBBY);
   1298 	loc = skpc(0xff, len, &inosused[start]);
   1299 	if (loc == 0) {
   1300 		len = start + 1;
   1301 		start = 0;
   1302 		loc = skpc(0xff, len, &inosused[0]);
   1303 		if (loc == 0) {
   1304 			printf("cg = %d, irotor = %d, fs = %s\n",
   1305 			    cg, ufs_rw32(cgp->cg_irotor, needswap),
   1306 				fs->fs_fsmnt);
   1307 			panic("ffs_nodealloccg: map corrupted");
   1308 			/* NOTREACHED */
   1309 		}
   1310 	}
   1311 	i = start + len - loc;
   1312 	map = inosused[i];
   1313 	ipref = i * NBBY;
   1314 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
   1315 		if ((map & i) == 0) {
   1316 			cgp->cg_irotor = ufs_rw32(ipref, needswap);
   1317 			goto gotit;
   1318 		}
   1319 	}
   1320 	printf("fs = %s\n", fs->fs_fsmnt);
   1321 	panic("ffs_nodealloccg: block not in map");
   1322 	/* NOTREACHED */
   1323 gotit:
   1324 	UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
   1325 	    mode);
   1326 	/*
   1327 	 * Check to see if we need to initialize more inodes.
   1328 	 */
   1329 	if (ibp != NULL) {
   1330 		KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
   1331 		memset(ibp->b_data, 0, fs->fs_bsize);
   1332 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
   1333 		for (i = 0; i < INOPB(fs); i++) {
   1334 			/*
   1335 			 * Don't bother to swap, it's supposed to be
   1336 			 * random, after all.
   1337 			 */
   1338 			dp2->di_gen = (arc4random() & INT32_MAX) / 2 + 1;
   1339 			dp2++;
   1340 		}
   1341 		initediblk += INOPB(fs);
   1342 		cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
   1343 	}
   1344 
   1345 	mutex_enter(&ump->um_lock);
   1346 	ACTIVECG_CLR(fs, cg);
   1347 	setbit(inosused, ipref);
   1348 	ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
   1349 	fs->fs_cstotal.cs_nifree--;
   1350 	fs->fs_cs(fs, cg).cs_nifree--;
   1351 	fs->fs_fmod = 1;
   1352 	if ((mode & IFMT) == IFDIR) {
   1353 		ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
   1354 		fs->fs_cstotal.cs_ndir++;
   1355 		fs->fs_cs(fs, cg).cs_ndir++;
   1356 	}
   1357 	mutex_exit(&ump->um_lock);
   1358 	if (DOINGSOFTDEP(ITOV(ip)))
   1359 		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
   1360 	if (ibp != NULL) {
   1361 		bwrite(bp);
   1362 		bawrite(ibp);
   1363 	} else
   1364 		bdwrite(bp);
   1365 	return (cg * fs->fs_ipg + ipref);
   1366  fail:
   1367 	if (bp != NULL)
   1368 		brelse(bp, 0);
   1369 	if (ibp != NULL)
   1370 		brelse(ibp, BC_INVAL);
   1371 	mutex_enter(&ump->um_lock);
   1372 	return (0);
   1373 }
   1374 
   1375 /*
   1376  * Allocate a block or fragment.
   1377  *
   1378  * The specified block or fragment is removed from the
   1379  * free map, possibly fragmenting a block in the process.
   1380  *
    1381  * This implementation should mirror ffs_blkfree
   1382  *
   1383  * => um_lock not held on entry or exit
   1384  */
   1385 int
   1386 ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
   1387 {
   1388 	struct ufsmount *ump = ip->i_ump;
   1389 	struct fs *fs = ip->i_fs;
   1390 	struct cg *cgp;
   1391 	struct buf *bp;
   1392 	int32_t fragno, cgbno;
   1393 	int i, error, cg, blk, frags, bbase;
   1394 	u_int8_t *blksfree;
   1395 	const int needswap = UFS_FSNEEDSWAP(fs);
   1396 
   1397 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
   1398 	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
   1399 		printf("dev = 0x%x, bno = %" PRId64 " bsize = %d, "
   1400 		       "size = %ld, fs = %s\n",
   1401 		    ip->i_dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
   1402 		panic("blkalloc: bad size");
   1403 	}
   1404 	cg = dtog(fs, bno);
   1405 	if (bno >= fs->fs_size) {
   1406 		printf("bad block %" PRId64 ", ino %" PRId64 "\n", bno,
   1407 		    ip->i_number);
   1408 		ffs_fserr(fs, ip->i_uid, "bad block");
   1409 		return EINVAL;
   1410 	}
   1411 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
   1412 		(int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
   1413 	if (error) {
   1414 		brelse(bp, 0);
   1415 		return error;
   1416 	}
   1417 	cgp = (struct cg *)bp->b_data;
   1418 	if (!cg_chkmagic(cgp, needswap)) {
   1419 		brelse(bp, 0);
   1420 		return EIO;
   1421 	}
   1422 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
   1423 	cgp->cg_time = ufs_rw64(time_second, needswap);
   1424 	cgbno = dtogd(fs, bno);
   1425 	blksfree = cg_blksfree(cgp, needswap);
   1426 
   1427 	mutex_enter(&ump->um_lock);
   1428 	if (size == fs->fs_bsize) {
   1429 		fragno = fragstoblks(fs, cgbno);
   1430 		if (!ffs_isblock(fs, blksfree, fragno)) {
   1431 			mutex_exit(&ump->um_lock);
   1432 			brelse(bp, 0);
   1433 			return EBUSY;
   1434 		}
   1435 		ffs_clrblock(fs, blksfree, fragno);
   1436 		ffs_clusteracct(fs, cgp, fragno, -1);
   1437 		ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
   1438 		fs->fs_cstotal.cs_nbfree--;
   1439 		fs->fs_cs(fs, cg).cs_nbfree--;
   1440 	} else {
   1441 		bbase = cgbno - fragnum(fs, cgbno);
   1442 
   1443 		frags = numfrags(fs, size);
   1444 		for (i = 0; i < frags; i++) {
   1445 			if (isclr(blksfree, cgbno + i)) {
   1446 				mutex_exit(&ump->um_lock);
   1447 				brelse(bp, 0);
   1448 				return EBUSY;
   1449 			}
   1450 		}
   1451 		/*
   1452 		 * if a complete block is being split, account for it
   1453 		 */
   1454 		fragno = fragstoblks(fs, bbase);
   1455 		if (ffs_isblock(fs, blksfree, fragno)) {
   1456 			ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
   1457 			fs->fs_cstotal.cs_nffree += fs->fs_frag;
   1458 			fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
   1459 			ffs_clusteracct(fs, cgp, fragno, -1);
   1460 			ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
   1461 			fs->fs_cstotal.cs_nbfree--;
   1462 			fs->fs_cs(fs, cg).cs_nbfree--;
   1463 		}
   1464 		/*
   1465 		 * decrement the counts associated with the old frags
   1466 		 */
   1467 		blk = blkmap(fs, blksfree, bbase);
   1468 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
   1469 		/*
   1470 		 * allocate the fragment
   1471 		 */
   1472 		for (i = 0; i < frags; i++) {
   1473 			clrbit(blksfree, cgbno + i);
   1474 		}
   1475 		ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
   1476 		fs->fs_cstotal.cs_nffree -= i;
   1477 		fs->fs_cs(fs, cg).cs_nffree -= i;
   1478 		/*
   1479 		 * add back in counts associated with the new frags
   1480 		 */
   1481 		blk = blkmap(fs, blksfree, bbase);
   1482 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
   1483 	}
   1484 	fs->fs_fmod = 1;
   1485 	ACTIVECG_CLR(fs, cg);
   1486 	mutex_exit(&ump->um_lock);
   1487 	bdwrite(bp);
   1488 	return 0;
   1489 }
   1490 
   1491 /*
   1492  * Free a block or fragment.
   1493  *
   1494  * The specified block or fragment is placed back in the
   1495  * free map. If a fragment is deallocated, a possible
   1496  * block reassembly is checked.
   1497  *
   1498  * => um_lock not held on entry or exit
   1499  */
   1500 void
   1501 ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
   1502     ino_t inum)
   1503 {
   1504 	struct cg *cgp;
   1505 	struct buf *bp;
   1506 	struct ufsmount *ump;
   1507 	int32_t fragno, cgbno;
   1508 	daddr_t cgblkno;
   1509 	int i, error, cg, blk, frags, bbase;
   1510 	u_int8_t *blksfree;
   1511 	dev_t dev;
   1512 	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
   1513 	const int needswap = UFS_FSNEEDSWAP(fs);
   1514 
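         	/*
         	 * devvp is normally the block device holding the file system;
         	 * when freeing on behalf of a snapshot it is the snapshot
         	 * file's vnode instead, and the cylinder group is then read
         	 * by logical block number rather than by device block number.
         	 */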
   1515 	cg = dtog(fs, bno);
   1516 	if (devvp_is_snapshot) {
   1517 		dev = VTOI(devvp)->i_devvp->v_rdev;
   1518 		ump = VFSTOUFS(devvp->v_mount);
   1519 		cgblkno = fragstoblks(fs, cgtod(fs, cg));
   1520 	} else {
   1521 		dev = devvp->v_rdev;
   1522 		ump = VFSTOUFS(devvp->v_specmountpoint);
   1523 		cgblkno = fsbtodb(fs, cgtod(fs, cg));
   1524 		if (ffs_snapblkfree(fs, devvp, bno, size, inum))
   1525 			return;
   1526 	}
   1527 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
   1528 	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
   1529 		printf("dev = 0x%x, bno = %" PRId64 " bsize = %d, "
   1530 		       "size = %ld, fs = %s\n",
   1531 		    dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
   1532 		panic("blkfree: bad size");
   1533 	}
   1534 
   1535 	if (bno >= fs->fs_size) {
   1536 		printf("bad block %" PRId64 ", ino %llu\n", bno,
   1537 		    (unsigned long long)inum);
   1538 		ffs_fserr(fs, inum, "bad block");
   1539 		return;
   1540 	}
   1541 	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
   1542 	    NOCRED, B_MODIFY, &bp);
   1543 	if (error) {
   1544 		brelse(bp, 0);
   1545 		return;
   1546 	}
   1547 	cgp = (struct cg *)bp->b_data;
   1548 	if (!cg_chkmagic(cgp, needswap)) {
   1549 		brelse(bp, 0);
   1550 		return;
   1551 	}
   1552 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
   1553 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
   1554 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
   1555 		cgp->cg_time = ufs_rw64(time_second, needswap);
   1556 	cgbno = dtogd(fs, bno);
   1557 	blksfree = cg_blksfree(cgp, needswap);
   1558 	mutex_enter(&ump->um_lock);
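         	/*
         	 * freeing a whole block: all of its fragments must currently
         	 * be allocated; mark them free and credit the block and
         	 * cluster counts
         	 */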
   1559 	if (size == fs->fs_bsize) {
   1560 		fragno = fragstoblks(fs, cgbno);
   1561 		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
   1562 			if (devvp_is_snapshot) {
   1563 				mutex_exit(&ump->um_lock);
   1564 				brelse(bp, 0);
   1565 				return;
   1566 			}
   1567 			printf("dev = 0x%x, block = %" PRId64 ", fs = %s\n",
   1568 			    dev, bno, fs->fs_fsmnt);
   1569 			panic("blkfree: freeing free block");
   1570 		}
   1571 		ffs_setblock(fs, blksfree, fragno);
   1572 		ffs_clusteracct(fs, cgp, fragno, 1);
   1573 		ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
   1574 		fs->fs_cstotal.cs_nbfree++;
   1575 		fs->fs_cs(fs, cg).cs_nbfree++;
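         		/*
         		 * old-format UFS1 cylinder groups also keep per-cylinder
         		 * free block counts indexed by rotational position;
         		 * keep them in step
         		 */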
   1576 		if ((fs->fs_magic == FS_UFS1_MAGIC) &&
   1577 		    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
   1578 			i = old_cbtocylno(fs, cgbno);
   1579 			KASSERT(i >= 0);
   1580 			KASSERT(i < fs->fs_old_ncyl);
   1581 			KASSERT(old_cbtorpos(fs, cgbno) >= 0);
   1582 			KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
   1583 			ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
   1584 			    needswap);
   1585 			ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
   1586 		}
   1587 	} else {
   1588 		bbase = cgbno - fragnum(fs, cgbno);
   1589 		/*
   1590 		 * decrement the counts associated with the old frags
   1591 		 */
   1592 		blk = blkmap(fs, blksfree, bbase);
   1593 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
   1594 		/*
   1595 		 * deallocate the fragment
   1596 		 */
   1597 		frags = numfrags(fs, size);
   1598 		for (i = 0; i < frags; i++) {
   1599 			if (isset(blksfree, cgbno + i)) {
   1600 				printf("dev = 0x%x, block = %" PRId64
   1601 				       ", fs = %s\n",
   1602 				    dev, bno + i, fs->fs_fsmnt);
   1603 				panic("blkfree: freeing free frag");
   1604 			}
   1605 			setbit(blksfree, cgbno + i);
   1606 		}
   1607 		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
   1608 		fs->fs_cstotal.cs_nffree += i;
   1609 		fs->fs_cs(fs, cg).cs_nffree += i;
   1610 		/*
   1611 		 * add back in counts associated with the new frags
   1612 		 */
   1613 		blk = blkmap(fs, blksfree, bbase);
   1614 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
   1615 		/*
   1616 		 * if a complete block has been reassembled, account for it
   1617 		 */
   1618 		fragno = fragstoblks(fs, bbase);
   1619 		if (ffs_isblock(fs, blksfree, fragno)) {
   1620 			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
   1621 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
   1622 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
   1623 			ffs_clusteracct(fs, cgp, fragno, 1);
   1624 			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
   1625 			fs->fs_cstotal.cs_nbfree++;
   1626 			fs->fs_cs(fs, cg).cs_nbfree++;
   1627 			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
   1628 			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
   1629 				i = old_cbtocylno(fs, bbase);
   1630 				KASSERT(i >= 0);
   1631 				KASSERT(i < fs->fs_old_ncyl);
   1632 				KASSERT(old_cbtorpos(fs, bbase) >= 0);
   1633 				KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
   1634 				ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
   1635 				    bbase)], 1, needswap);
   1636 				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
   1637 			}
   1638 		}
   1639 	}
   1640 	fs->fs_fmod = 1;
   1641 	ACTIVECG_CLR(fs, cg);
   1642 	mutex_exit(&ump->um_lock);
   1643 	bdwrite(bp);
   1644 }
   1645 
   1646 /*
   1647  * Free an inode.
   1648  */
   1649 int
   1650 ffs_vfree(struct vnode *vp, ino_t ino, int mode)
   1651 {
   1652 
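         	/*
         	 * with soft dependencies the on-disk inode free is deferred to
         	 * the softdep code; otherwise release the inode in the
         	 * cylinder group map now
         	 */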
   1653 	if (DOINGSOFTDEP(vp)) {
   1654 		softdep_freefile(vp, ino, mode);
   1655 		return (0);
   1656 	}
   1657 	return ffs_freefile(VTOI(vp)->i_fs, VTOI(vp)->i_devvp, ino, mode);
   1658 }
   1659 
   1660 /*
   1661  * Do the actual free operation.
   1662  * The specified inode is placed back in the free map.
   1663  *
   1664  * => um_lock not held on entry or exit
   1665  */
   1666 int
   1667 ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
   1668 {
   1669 	struct ufsmount *ump;
   1670 	struct cg *cgp;
   1671 	struct buf *bp;
   1672 	int error, cg;
   1673 	daddr_t cgbno;
   1674 	u_int8_t *inosused;
   1675 	dev_t dev;
   1676 	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
   1677 #ifdef FFS_EI
   1678 	const int needswap = UFS_FSNEEDSWAP(fs);
   1679 #endif
   1680 
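         	/*
         	 * on a journaled (WAPBL) filesystem the free must happen
         	 * inside the current journal transaction; the check is
         	 * skipped when freeing within a snapshot
         	 */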
   1681 	if (!devvp_is_snapshot) {
   1682 		UFS_WAPBL_JLOCK_ASSERT(devvp->v_specinfo->si_mountpoint);
   1683 	}
   1684 
   1685 	cg = ino_to_cg(fs, ino);
   1686 	if (devvp_is_snapshot) {
   1687 		dev = VTOI(devvp)->i_devvp->v_rdev;
   1688 		ump = VFSTOUFS(devvp->v_mount);
   1689 		cgbno = fragstoblks(fs, cgtod(fs, cg));
   1690 	} else {
   1691 		dev = devvp->v_rdev;
   1692 		ump = VFSTOUFS(devvp->v_specmountpoint);
   1693 		cgbno = fsbtodb(fs, cgtod(fs, cg));
   1694 	}
   1695 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
   1696 		panic("ifree: range: dev = 0x%x, ino = %llu, fs = %s",
   1697 		    dev, (unsigned long long)ino, fs->fs_fsmnt);
   1698 	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
   1699 	    NOCRED, B_MODIFY, &bp);
   1700 	if (error) {
   1701 		brelse(bp, 0);
   1702 		return (error);
   1703 	}
   1704 	cgp = (struct cg *)bp->b_data;
   1705 	if (!cg_chkmagic(cgp, needswap)) {
   1706 		brelse(bp, 0);
   1707 		return (0);
   1708 	}
   1709 	cgp->cg_old_time = ufs_rw32(time_second, needswap);
   1710 	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
   1711 	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
   1712 		cgp->cg_time = ufs_rw64(time_second, needswap);
   1713 	inosused = cg_inosused(cgp, needswap);
   1714 	ino %= fs->fs_ipg;
   1715 	if (isclr(inosused, ino)) {
   1716 		printf("ifree: dev = 0x%x, ino = %llu, fs = %s\n",
   1717 		    dev, (unsigned long long)ino + cg * fs->fs_ipg,
   1718 		    fs->fs_fsmnt);
   1719 		if (fs->fs_ronly == 0)
   1720 			panic("ifree: freeing free inode");
   1721 	}
   1722 	clrbit(inosused, ino);
   1723 	if (!devvp_is_snapshot)
   1724 		UFS_WAPBL_UNREGISTER_INODE(devvp->v_specmountpoint,
   1725 		    ino + cg * fs->fs_ipg, mode);
   1726 	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
   1727 		cgp->cg_irotor = ufs_rw32(ino, needswap);
   1728 	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
   1729 	mutex_enter(&ump->um_lock);
   1730 	fs->fs_cstotal.cs_nifree++;
   1731 	fs->fs_cs(fs, cg).cs_nifree++;
   1732 	if ((mode & IFMT) == IFDIR) {
   1733 		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
   1734 		fs->fs_cstotal.cs_ndir--;
   1735 		fs->fs_cs(fs, cg).cs_ndir--;
   1736 	}
   1737 	fs->fs_fmod = 1;
   1738 	ACTIVECG_CLR(fs, cg);
   1739 	mutex_exit(&ump->um_lock);
   1740 	bdwrite(bp);
   1741 	return (0);
   1742 }
   1743 
   1744 /*
    1745  * Check to see if a file is free.  Returns nonzero if the inode is
          * free or cannot be checked, zero if it is in use.
   1746  */
   1747 int
   1748 ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
   1749 {
   1750 	struct cg *cgp;
   1751 	struct buf *bp;
   1752 	daddr_t cgbno;
   1753 	int ret, cg;
   1754 	u_int8_t *inosused;
   1755 	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
   1756 
   1757 	cg = ino_to_cg(fs, ino);
   1758 	if (devvp_is_snapshot)
   1759 		cgbno = fragstoblks(fs, cgtod(fs, cg));
   1760 	else
   1761 		cgbno = fsbtodb(fs, cgtod(fs, cg));
   1762 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
   1763 		return 1;
   1764 	if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) {
   1765 		brelse(bp, 0);
   1766 		return 1;
   1767 	}
   1768 	cgp = (struct cg *)bp->b_data;
   1769 	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
   1770 		brelse(bp, 0);
   1771 		return 1;
   1772 	}
   1773 	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
   1774 	ino %= fs->fs_ipg;
   1775 	ret = isclr(inosused, ino);
   1776 	brelse(bp, 0);
   1777 	return ret;
   1778 }
   1779 
   1780 /*
   1781  * Find a block of the specified size in the specified cylinder group.
   1782  *
   1783  * It is a panic if a request is made to find a block if none are
   1784  * available.
   1785  */
   1786 static int32_t
   1787 ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
   1788 {
   1789 	int32_t bno;
   1790 	int start, len, loc, i;
   1791 	int blk, field, subfield, pos;
   1792 	int ostart, olen;
   1793 	u_int8_t *blksfree;
   1794 #ifdef FFS_EI
   1795 	const int needswap = UFS_FSNEEDSWAP(fs);
   1796 #endif
   1797 
   1798 	/* KASSERT(mutex_owned(&ump->um_lock)); */
   1799 
   1800 	/*
   1801 	 * find the fragment by searching through the free block
   1802 	 * map for an appropriate bit pattern
   1803 	 */
   1804 	if (bpref)
   1805 		start = dtogd(fs, bpref) / NBBY;
   1806 	else
   1807 		start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
   1808 	blksfree = cg_blksfree(cgp, needswap);
   1809 	len = howmany(fs->fs_fpg, NBBY) - start;
   1810 	ostart = start;
   1811 	olen = len;
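         	/*
         	 * scanc() examines the map one byte at a time, using
         	 * fragtbl[fs_frag] to tell which free fragment run sizes are
         	 * present in each byte value, and returns the count of bytes
         	 * from the first matching byte to the end of the region, or 0
         	 * if none matched.  Search from the rotor to the end of the
         	 * map, then wrap to the beginning.
         	 */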
   1812 	loc = scanc((u_int)len,
   1813 		(const u_char *)&blksfree[start],
   1814 		(const u_char *)fragtbl[fs->fs_frag],
   1815 		(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
   1816 	if (loc == 0) {
   1817 		len = start + 1;
   1818 		start = 0;
   1819 		loc = scanc((u_int)len,
   1820 			(const u_char *)&blksfree[0],
   1821 			(const u_char *)fragtbl[fs->fs_frag],
   1822 			(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
   1823 		if (loc == 0) {
   1824 			printf("start = %d, len = %d, fs = %s\n",
   1825 			    ostart, olen, fs->fs_fsmnt);
   1826 			printf("offset=%d %ld\n",
   1827 				ufs_rw32(cgp->cg_freeoff, needswap),
   1828 				(long)blksfree - (long)cgp);
   1829 			printf("cg %d\n", cgp->cg_cgx);
   1830 			panic("ffs_alloccg: map corrupted");
   1831 			/* NOTREACHED */
   1832 		}
   1833 	}
   1834 	bno = (start + len - loc) * NBBY;
   1835 	cgp->cg_frotor = ufs_rw32(bno, needswap);
   1836 	/*
   1837 	 * found the byte in the map
   1838 	 * sift through the bits to find the selected frag
   1839 	 */
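         	/*
         	 * blk is this block's fragment map shifted left one bit so
         	 * that bit 0 acts as an allocated boundary; around[allocsiz]
         	 * and inside[allocsiz] form a mask/pattern pair that matches
         	 * a run of allocsiz free frags, slid across the block one
         	 * position at a time
         	 */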
   1840 	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
   1841 		blk = blkmap(fs, blksfree, bno);
   1842 		blk <<= 1;
   1843 		field = around[allocsiz];
   1844 		subfield = inside[allocsiz];
   1845 		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
   1846 			if ((blk & field) == subfield)
   1847 				return (bno + pos);
   1848 			field <<= 1;
   1849 			subfield <<= 1;
   1850 		}
   1851 	}
   1852 	printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
   1853 	panic("ffs_alloccg: block not in map");
   1854 	/* return (-1); */
   1855 }
   1856 
   1857 /*
   1858  * Update the cluster map because of an allocation or free.
   1859  *
   1860  * Cnt == 1 means free; cnt == -1 means allocating.
   1861  */
   1862 void
   1863 ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt)
   1864 {
   1865 	int32_t *sump;
   1866 	int32_t *lp;
   1867 	u_char *freemapp, *mapp;
   1868 	int i, start, end, forw, back, map, bit;
   1869 #ifdef FFS_EI
   1870 	const int needswap = UFS_FSNEEDSWAP(fs);
   1871 #endif
   1872 
   1873 	/* KASSERT(mutex_owned(&ump->um_lock)); */
   1874 
   1875 	if (fs->fs_contigsumsize <= 0)
   1876 		return;
   1877 	freemapp = cg_clustersfree(cgp, needswap);
   1878 	sump = cg_clustersum(cgp, needswap);
   1879 	/*
    1880 	 * Mark the block free (cnt > 0) or allocated (cnt < 0) in the map.
   1881 	 */
   1882 	if (cnt > 0)
   1883 		setbit(freemapp, blkno);
   1884 	else
   1885 		clrbit(freemapp, blkno);
   1886 	/*
   1887 	 * Find the size of the cluster going forward.
   1888 	 */
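         	/*
         	 * walk the cluster free map a bit at a time, stopping at the
         	 * first allocated block or after fs_contigsumsize blocks
         	 */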
   1889 	start = blkno + 1;
   1890 	end = start + fs->fs_contigsumsize;
   1891 	if (end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
   1892 		end = ufs_rw32(cgp->cg_nclusterblks, needswap);
   1893 	mapp = &freemapp[start / NBBY];
   1894 	map = *mapp++;
   1895 	bit = 1 << (start % NBBY);
   1896 	for (i = start; i < end; i++) {
   1897 		if ((map & bit) == 0)
   1898 			break;
   1899 		if ((i & (NBBY - 1)) != (NBBY - 1)) {
   1900 			bit <<= 1;
   1901 		} else {
   1902 			map = *mapp++;
   1903 			bit = 1;
   1904 		}
   1905 	}
   1906 	forw = i - start;
   1907 	/*
   1908 	 * Find the size of the cluster going backward.
   1909 	 */
   1910 	start = blkno - 1;
   1911 	end = start - fs->fs_contigsumsize;
   1912 	if (end < 0)
   1913 		end = -1;
   1914 	mapp = &freemapp[start / NBBY];
   1915 	map = *mapp--;
   1916 	bit = 1 << (start % NBBY);
   1917 	for (i = start; i > end; i--) {
   1918 		if ((map & bit) == 0)
   1919 			break;
   1920 		if ((i & (NBBY - 1)) != 0) {
   1921 			bit >>= 1;
   1922 		} else {
   1923 			map = *mapp--;
   1924 			bit = 1 << (NBBY - 1);
   1925 		}
   1926 	}
   1927 	back = start - i;
   1928 	/*
   1929 	 * Account for old cluster and the possibly new forward and
   1930 	 * back clusters.
   1931 	 */
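         	/*
         	 * e.g. freeing (cnt == 1) a block with a free run of 2 behind
         	 * it and 1 ahead removes a run of 2 and a run of 1 from the
         	 * sums and adds a single run of 4 (capped at fs_contigsumsize)
         	 */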
   1932 	i = back + forw + 1;
   1933 	if (i > fs->fs_contigsumsize)
   1934 		i = fs->fs_contigsumsize;
   1935 	ufs_add32(sump[i], cnt, needswap);
   1936 	if (back > 0)
   1937 		ufs_add32(sump[back], -cnt, needswap);
   1938 	if (forw > 0)
   1939 		ufs_add32(sump[forw], -cnt, needswap);
   1940 
   1941 	/*
   1942 	 * Update cluster summary information.
   1943 	 */
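         	/*
         	 * fs_maxcluster[] caches, per cylinder group, the largest run
         	 * length still recorded in the summary (capped at
         	 * fs_contigsumsize); the cluster allocator uses it to skip
         	 * groups that cannot satisfy a request
         	 */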
   1944 	lp = &sump[fs->fs_contigsumsize];
   1945 	for (i = fs->fs_contigsumsize; i > 0; i--)
   1946 		if (ufs_rw32(*lp--, needswap) > 0)
   1947 			break;
   1948 	fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
   1949 }
   1950 
   1951 /*
   1952  * Fserr prints the name of a file system with an error diagnostic.
   1953  *
   1954  * The form of the error message is:
    1955  *	uid <uid>, pid <pid>, command <command>, on fs: error message
   1956  */
   1957 static void
   1958 ffs_fserr(struct fs *fs, u_int uid, const char *cp)
   1959 {
   1960 
   1961 	log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
   1962 	    uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
   1963 }
   1964