      1 /*	$NetBSD: ffs_balloc.c,v 1.51.4.1 2011/06/18 17:00:25 bouyer Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2002 Networks Associates Technology, Inc.
      5  * All rights reserved.
      6  *
      7  * This software was developed for the FreeBSD Project by Marshall
      8  * Kirk McKusick and Network Associates Laboratories, the Security
      9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
     10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
     11  * research program
     12  *
     13  * Copyright (c) 1982, 1986, 1989, 1993
     14  *	The Regents of the University of California.  All rights reserved.
     15  *
     16  * Redistribution and use in source and binary forms, with or without
     17  * modification, are permitted provided that the following conditions
     18  * are met:
     19  * 1. Redistributions of source code must retain the above copyright
     20  *    notice, this list of conditions and the following disclaimer.
     21  * 2. Redistributions in binary form must reproduce the above copyright
     22  *    notice, this list of conditions and the following disclaimer in the
     23  *    documentation and/or other materials provided with the distribution.
     24  * 3. Neither the name of the University nor the names of its contributors
     25  *    may be used to endorse or promote products derived from this software
     26  *    without specific prior written permission.
     27  *
     28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     38  * SUCH DAMAGE.
     39  *
     40  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
     41  */
     42 
     43 #include <sys/cdefs.h>
     44 __KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.51.4.1 2011/06/18 17:00:25 bouyer Exp $");
     45 
     46 #if defined(_KERNEL_OPT)
     47 #include "opt_quota.h"
     48 #endif
     49 
     50 #include <sys/param.h>
     51 #include <sys/systm.h>
     52 #include <sys/buf.h>
     53 #include <sys/file.h>
     54 #include <sys/mount.h>
     55 #include <sys/vnode.h>
     56 #include <sys/kauth.h>
     57 #include <sys/fstrans.h>
     58 
     59 #include <ufs/ufs/quota.h>
     60 #include <ufs/ufs/ufsmount.h>
     61 #include <ufs/ufs/inode.h>
     62 #include <ufs/ufs/ufs_extern.h>
     63 #include <ufs/ufs/ufs_bswap.h>
     64 
     65 #include <ufs/ffs/fs.h>
     66 #include <ufs/ffs/ffs_extern.h>
     67 
     68 #include <uvm/uvm.h>
     69 
     70 static int ffs_balloc_ufs1(struct vnode *, off_t, int, kauth_cred_t, int,
     71     struct buf **);
     72 static int ffs_balloc_ufs2(struct vnode *, off_t, int, kauth_cred_t, int,
     73     struct buf **);
     74 
     75 /*
     76  * Balloc defines the structure of file system storage
     77  * by allocating the physical blocks on a device given
     78  * the inode and the logical block number in a file.
     79  */
     80 
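         /*
          * Illustrative only: a rough sketch of how a writer might call
          * ffs_balloc() to back the byte range it is about to modify.
          * Locking, inode size updates and the real copy path are left
          * out, and the variable names here are assumptions rather than
          * code taken from an actual caller.
          *
          *	struct buf *bp;
          *	int error;
          *
          *	error = ffs_balloc(vp, off, xfersize, cred, B_CLRBUF, &bp);
          *	if (error == 0) {
          *		memcpy((char *)bp->b_data + blkoff(fs, off),
          *		    data, xfersize);
          *		error = bwrite(bp);
          *	}
          */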
     81 int
     82 ffs_balloc(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags,
     83     struct buf **bpp)
     84 {
     85 	int error;
     86 
     87 	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC)
     88 		error = ffs_balloc_ufs2(vp, off, size, cred, flags, bpp);
     89 	else
     90 		error = ffs_balloc_ufs1(vp, off, size, cred, flags, bpp);
     91 
     92 	if (error == 0 && bpp != NULL && (error = fscow_run(*bpp, false)) != 0)
     93 		brelse(*bpp, 0);
     94 
     95 	return error;
     96 }
     97 
     98 static int
     99 ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
    100     int flags, struct buf **bpp)
    101 {
    102 	daddr_t lbn, lastlbn;
    103 	struct buf *bp, *nbp;
    104 	struct inode *ip = VTOI(vp);
    105 	struct fs *fs = ip->i_fs;
    106 	struct ufsmount *ump = ip->i_ump;
    107 	struct indir indirs[NIADDR + 2];
    108 	daddr_t newb, pref, nb;
    109 	int32_t *bap;	/* XXX ondisk32 */
    110 	int deallocated, osize, nsize, num, i, error;
    111 	int32_t *blkp, *allocblk, allociblk[NIADDR + 1];
    112 	int32_t *allocib;
    113 	int unwindidx = -1;
    114 #ifdef FFS_EI
    115 	const int needswap = UFS_FSNEEDSWAP(fs);
    116 #endif
    117 	UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);
    118 
    119 	lbn = lblkno(fs, off);
    120 	size = blkoff(fs, off) + size;
    121 	if (size > fs->fs_bsize)
    122 		panic("ffs_balloc: blk too big");
    123 	if (bpp != NULL) {
    124 		*bpp = NULL;
    125 	}
    126 	UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0);
    127 
    128 	if (lbn < 0)
    129 		return (EFBIG);
    130 
    131 	/*
    132 	 * If the next write will extend the file into a new block,
     133 	 * and the file is currently composed of a fragment,
    134 	 * this fragment has to be extended to be a full block.
    135 	 */
    136 
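         	/*
         	 * For example (figures illustrative): with 8 KB blocks and
         	 * 1 KB fragments, a 3000 byte file ends in a 3-fragment
         	 * piece, and that piece is grown to a full block by the
         	 * ffs_realloccg() call below before any later block of the
         	 * file is allocated.
         	 */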
    137 	lastlbn = lblkno(fs, ip->i_size);
    138 	if (lastlbn < NDADDR && lastlbn < lbn) {
    139 		nb = lastlbn;
    140 		osize = blksize(fs, ip, nb);
    141 		if (osize < fs->fs_bsize && osize > 0) {
    142 			mutex_enter(&ump->um_lock);
    143 			error = ffs_realloccg(ip, nb,
    144 				    ffs_blkpref_ufs1(ip, lastlbn, nb, flags,
    145 					&ip->i_ffs1_db[0]),
    146 				    osize, (int)fs->fs_bsize, cred, bpp, &newb);
    147 			if (error)
    148 				return (error);
    149 			if (DOINGSOFTDEP(vp))
    150 				softdep_setup_allocdirect(ip, nb, newb,
    151 				    ufs_rw32(ip->i_ffs1_db[nb], needswap),
    152 				    fs->fs_bsize, osize, bpp ? *bpp : NULL);
    153 			ip->i_size = lblktosize(fs, nb + 1);
    154 			ip->i_ffs1_size = ip->i_size;
    155 			uvm_vnp_setsize(vp, ip->i_ffs1_size);
    156 			ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap);
    157 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
    158 			if (bpp && *bpp) {
    159 				if (flags & B_SYNC)
    160 					bwrite(*bpp);
    161 				else
    162 					bawrite(*bpp);
    163 			}
    164 		}
    165 	}
    166 
    167 	/*
    168 	 * The first NDADDR blocks are direct blocks
    169 	 */
    170 
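         	/*
         	 * (With the common 8 KB block size the NDADDR == 12 direct
         	 * blocks cover the first 96 KB of a file; anything beyond
         	 * that is reached through the indirect chain handled below.)
         	 */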
    171 	if (lbn < NDADDR) {
    172 		nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap);
    173 		if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) {
    174 
    175 			/*
    176 			 * The block is an already-allocated direct block
    177 			 * and the file already extends past this block,
    178 			 * thus this must be a whole block.
    179 			 * Just read the block (if requested).
    180 			 */
    181 
    182 			if (bpp != NULL) {
    183 				error = bread(vp, lbn, fs->fs_bsize, NOCRED,
    184 					      B_MODIFY, bpp);
    185 				if (error) {
    186 					brelse(*bpp, 0);
    187 					return (error);
    188 				}
    189 			}
    190 			return (0);
    191 		}
    192 		if (nb != 0) {
    193 
    194 			/*
    195 			 * Consider need to reallocate a fragment.
    196 			 */
    197 
    198 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
    199 			nsize = fragroundup(fs, size);
    200 			if (nsize <= osize) {
    201 
    202 				/*
    203 				 * The existing block is already
    204 				 * at least as big as we want.
    205 				 * Just read the block (if requested).
    206 				 */
    207 
    208 				if (bpp != NULL) {
    209 					error = bread(vp, lbn, osize, NOCRED,
    210 						      B_MODIFY, bpp);
    211 					if (error) {
    212 						brelse(*bpp, 0);
    213 						return (error);
    214 					}
    215 				}
    216 				return 0;
    217 			} else {
    218 
    219 				/*
    220 				 * The existing block is smaller than we want,
    221 				 * grow it.
    222 				 */
    223 				mutex_enter(&ump->um_lock);
    224 				error = ffs_realloccg(ip, lbn,
    225 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
    226 					&ip->i_ffs1_db[0]),
    227 				    osize, nsize, cred, bpp, &newb);
    228 				if (error)
    229 					return (error);
    230 				if (DOINGSOFTDEP(vp))
    231 					softdep_setup_allocdirect(ip, lbn,
    232 					    newb, nb, nsize, osize,
    233 					    bpp ? *bpp : NULL);
    234 			}
    235 		} else {
    236 
    237 			/*
     238 			 * The block was not previously allocated;
     239 			 * allocate a new block or fragment.
    240 			 */
    241 
    242 			if (ip->i_size < lblktosize(fs, lbn + 1))
    243 				nsize = fragroundup(fs, size);
    244 			else
    245 				nsize = fs->fs_bsize;
    246 			mutex_enter(&ump->um_lock);
    247 			error = ffs_alloc(ip, lbn,
    248 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
    249 				&ip->i_ffs1_db[0]),
    250 			    nsize, flags, cred, &newb);
    251 			if (error)
    252 				return (error);
    253 			if (bpp != NULL) {
    254 				error = ffs_getblk(vp, lbn, fsbtodb(fs, newb),
    255 				    nsize, (flags & B_CLRBUF) != 0, bpp);
    256 				if (error)
    257 					return error;
    258 			}
    259 			if (DOINGSOFTDEP(vp)) {
    260 				softdep_setup_allocdirect(ip, lbn, newb, 0,
    261 				    nsize, 0, bpp ? *bpp : NULL);
    262 			}
    263 		}
    264 		ip->i_ffs1_db[lbn] = ufs_rw32((u_int32_t)newb, needswap);
    265 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    266 		return (0);
    267 	}
    268 
    269 	/*
    270 	 * Determine the number of levels of indirection.
    271 	 */
    272 
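         	/*
         	 * ufs_getlbns() returns in indirs[] the chain of indirect
         	 * blocks (logical block number and offset at each level)
         	 * that must be walked to reach lbn, and the depth of that
         	 * chain in num.
         	 */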
    273 	pref = 0;
    274 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
    275 		return (error);
    276 
    277 	/*
     278 	 * Fetch the first indirect block, allocating if necessary.
    279 	 */
    280 
    281 	--num;
    282 	nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap);
    283 	allocib = NULL;
    284 	allocblk = allociblk;
    285 	if (nb == 0) {
    286 		mutex_enter(&ump->um_lock);
    287 		pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL);
    288 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
    289 		    flags | B_METAONLY, cred, &newb);
    290 		if (error)
    291 			goto fail;
    292 		nb = newb;
    293 		*allocblk++ = nb;
    294 		error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb),
    295 		    fs->fs_bsize, true, &bp);
    296 		if (error)
    297 			goto fail;
    298 		if (DOINGSOFTDEP(vp)) {
    299 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
    300 			    newb, 0, fs->fs_bsize, 0, bp);
    301 			bdwrite(bp);
    302 		} else {
    303 
    304 			/*
    305 			 * Write synchronously so that indirect blocks
    306 			 * never point at garbage.
    307 			 */
    308 
    309 			if ((error = bwrite(bp)) != 0)
    310 				goto fail;
    311 		}
    312 		unwindidx = 0;
    313 		allocib = &ip->i_ffs1_ib[indirs[0].in_off];
    314 		*allocib = ufs_rw32(nb, needswap);
    315 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    316 	}
    317 
    318 	/*
    319 	 * Fetch through the indirect blocks, allocating as necessary.
    320 	 */
    321 
    322 	for (i = 1;;) {
    323 		error = bread(vp,
    324 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp);
    325 		if (error) {
    326 			brelse(bp, 0);
    327 			goto fail;
    328 		}
    329 		bap = (int32_t *)bp->b_data;	/* XXX ondisk32 */
    330 		nb = ufs_rw32(bap[indirs[i].in_off], needswap);
    331 		if (i == num)
    332 			break;
    333 		i++;
    334 		if (nb != 0) {
    335 			brelse(bp, 0);
    336 			continue;
    337 		}
    338 		if (fscow_run(bp, true) != 0) {
    339 			brelse(bp, 0);
    340 			goto fail;
    341 		}
    342 		mutex_enter(&ump->um_lock);
    343 		/* Try to keep snapshot indirect blocks contiguous. */
    344 		if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0)
    345 			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i-1].in_off,
    346 			    flags | B_METAONLY, &bap[0]);
    347 		if (pref == 0)
    348 			pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY,
    349 			    NULL);
    350 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
    351 		    flags | B_METAONLY, cred, &newb);
    352 		if (error) {
    353 			brelse(bp, 0);
    354 			goto fail;
    355 		}
    356 		nb = newb;
    357 		*allocblk++ = nb;
    358 		error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb),
    359 		    fs->fs_bsize, true, &nbp);
    360 		if (error) {
    361 			brelse(bp, 0);
    362 			goto fail;
    363 		}
    364 		if (DOINGSOFTDEP(vp)) {
    365 			softdep_setup_allocindir_meta(nbp, ip, bp,
    366 			    indirs[i - 1].in_off, nb);
    367 			bdwrite(nbp);
    368 		} else {
    369 
    370 			/*
    371 			 * Write synchronously so that indirect blocks
    372 			 * never point at garbage.
    373 			 */
    374 
    375 			if ((error = bwrite(nbp)) != 0) {
    376 				brelse(bp, 0);
    377 				goto fail;
    378 			}
    379 		}
    380 		if (unwindidx < 0)
    381 			unwindidx = i - 1;
    382 		bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap);
    383 
    384 		/*
    385 		 * If required, write synchronously, otherwise use
    386 		 * delayed write.
    387 		 */
    388 
    389 		if (flags & B_SYNC) {
    390 			bwrite(bp);
    391 		} else {
    392 			bdwrite(bp);
    393 		}
    394 	}
    395 
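         	/*
         	 * B_METAONLY callers want the indirect block that maps lbn,
         	 * not the data block itself, so hand back the buffer still
         	 * held from the loop above.
         	 */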
    396 	if (flags & B_METAONLY) {
    397 		KASSERT(bpp != NULL);
    398 		*bpp = bp;
    399 		return (0);
    400 	}
    401 
    402 	/*
    403 	 * Get the data block, allocating if necessary.
    404 	 */
    405 
    406 	if (nb == 0) {
    407 		if (fscow_run(bp, true) != 0) {
    408 			brelse(bp, 0);
    409 			goto fail;
    410 		}
    411 		mutex_enter(&ump->um_lock);
    412 		pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags,
    413 		    &bap[0]);
    414 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
    415 		    &newb);
    416 		if (error) {
    417 			brelse(bp, 0);
    418 			goto fail;
    419 		}
    420 		nb = newb;
    421 		*allocblk++ = nb;
    422 		if (bpp != NULL) {
    423 			error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
    424 			    fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp);
    425 			if (error) {
    426 				brelse(bp, 0);
    427 				goto fail;
    428 			}
    429 		}
    430 		if (DOINGSOFTDEP(vp))
    431 			softdep_setup_allocindir_page(ip, lbn, bp,
    432 			    indirs[num].in_off, nb, 0, bpp ? *bpp : NULL);
    433 		bap[indirs[num].in_off] = ufs_rw32(nb, needswap);
    434 		if (allocib == NULL && unwindidx < 0) {
    435 			unwindidx = i - 1;
    436 		}
    437 
    438 		/*
    439 		 * If required, write synchronously, otherwise use
    440 		 * delayed write.
    441 		 */
    442 
    443 		if (flags & B_SYNC) {
    444 			bwrite(bp);
    445 		} else {
    446 			bdwrite(bp);
    447 		}
    448 		return (0);
    449 	}
    450 	brelse(bp, 0);
    451 	if (bpp != NULL) {
    452 		if (flags & B_CLRBUF) {
    453 			error = bread(vp, lbn, (int)fs->fs_bsize,
    454 			    NOCRED, B_MODIFY, &nbp);
    455 			if (error) {
    456 				brelse(nbp, 0);
    457 				goto fail;
    458 			}
    459 		} else {
    460 			error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
    461 			    fs->fs_bsize, true, &nbp);
    462 			if (error)
    463 				goto fail;
    464 		}
    465 		*bpp = nbp;
    466 	}
    467 	return (0);
    468 
    469 fail:
    470 	/*
    471 	 * If we have failed part way through block allocation, we
    472 	 * have to deallocate any indirect blocks that we have allocated.
    473 	 */
    474 
    475 	if (unwindidx >= 0) {
    476 
    477 		/*
    478 		 * First write out any buffers we've created to resolve their
    479 		 * softdeps.  This must be done in reverse order of creation
    480 		 * so that we resolve the dependencies in one pass.
    481 		 * Write the cylinder group buffers for these buffers too.
    482 		 */
    483 
    484 		for (i = num; i >= unwindidx; i--) {
    485 			if (i == 0) {
    486 				break;
    487 			}
    488 			if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
    489 			    fs->fs_bsize, false, &bp) != 0)
    490 				continue;
    491 			if (bp->b_oflags & BO_DELWRI) {
    492 				nb = fsbtodb(fs, cgtod(fs, dtog(fs,
    493 				    dbtofsb(fs, bp->b_blkno))));
    494 				bwrite(bp);
    495 				if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK,
    496 				    fs->fs_cgsize, false, &bp) != 0)
    497 					continue;
    498 				if (bp->b_oflags & BO_DELWRI) {
    499 					bwrite(bp);
    500 				} else {
    501 					brelse(bp, BC_INVAL);
    502 				}
    503 			} else {
    504 				brelse(bp, BC_INVAL);
    505 			}
    506 		}
    507 
    508 		/* Now flush all dependencies to disk. */
    509 #ifdef notyet
    510 		/* XXX pages locked */
    511 		(void)softdep_sync_metadata(vp);
    512 #endif
    513 
    514 		if (DOINGSOFTDEP(vp) && unwindidx == 0) {
    515 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
    516 			ffs_update(vp, NULL, NULL, UPDATE_WAIT);
    517 		}
    518 
    519 		/*
    520 		 * Now that any dependencies that we created have been
    521 		 * resolved, we can undo the partial allocation.
    522 		 */
    523 
    524 		if (unwindidx == 0) {
    525 			*allocib = 0;
    526 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
    527 			if (DOINGSOFTDEP(vp))
    528 				ffs_update(vp, NULL, NULL, UPDATE_WAIT);
    529 		} else {
    530 			int r;
    531 
    532 			r = bread(vp, indirs[unwindidx].in_lbn,
    533 			    (int)fs->fs_bsize, NOCRED, 0, &bp);
    534 			if (r) {
    535 				panic("Could not unwind indirect block, error %d", r);
    536 				brelse(bp, 0);
    537 			} else {
    538 				bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
    539 				bap[indirs[unwindidx].in_off] = 0;
    540 				bwrite(bp);
    541 			}
    542 		}
    543 		for (i = unwindidx + 1; i <= num; i++) {
    544 			if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
    545 			    fs->fs_bsize, false, &bp) == 0)
    546 				brelse(bp, BC_INVAL);
    547 		}
    548 	}
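         	/*
         	 * Give back every block allocated on the way down; each one
         	 * was recorded in allociblk[] as it was allocated.
         	 */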
    549 	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
    550 		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
    551 		deallocated += fs->fs_bsize;
    552 	}
    553 	if (deallocated) {
    554 #ifdef QUOTA
    555 		/*
    556 		 * Restore user's disk quota because allocation failed.
    557 		 */
    558 		(void)chkdq(ip, -btodb(deallocated), cred, FORCE);
    559 #endif
    560 		ip->i_ffs1_blocks -= btodb(deallocated);
    561 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    562 	}
    563 	/*
    564 	 * Flush all dependencies again so that the soft updates code
    565 	 * doesn't find any untracked changes.
    566 	 */
    567 #ifdef notyet
    568 	/* XXX pages locked */
    569 	(void)softdep_sync_metadata(vp);
    570 #endif
    571 	return (error);
    572 }
    573 
    574 static int
    575 ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
    576     int flags, struct buf **bpp)
    577 {
    578 	daddr_t lbn, lastlbn;
    579 	struct buf *bp, *nbp;
    580 	struct inode *ip = VTOI(vp);
    581 	struct fs *fs = ip->i_fs;
    582 	struct ufsmount *ump = ip->i_ump;
    583 	struct indir indirs[NIADDR + 2];
    584 	daddr_t newb, pref, nb;
    585 	int64_t *bap;
    586 	int deallocated, osize, nsize, num, i, error;
    587 	daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
    588 	int64_t *allocib;
    589 	int unwindidx = -1;
    590 #ifdef FFS_EI
    591 	const int needswap = UFS_FSNEEDSWAP(fs);
    592 #endif
    593 	UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);
    594 
    595 	lbn = lblkno(fs, off);
    596 	size = blkoff(fs, off) + size;
    597 	if (size > fs->fs_bsize)
    598 		panic("ffs_balloc: blk too big");
    599 	if (bpp != NULL) {
    600 		*bpp = NULL;
    601 	}
    602 	UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0);
    603 
    604 	if (lbn < 0)
    605 		return (EFBIG);
    606 
    607 #ifdef notyet
    608 	/*
    609 	 * Check for allocating external data.
    610 	 */
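         	/*
         	 * Note: this disabled branch apparently mirrors FreeBSD's
         	 * UFS2 extended-attribute ("external data") allocation; as
         	 * written it would not compile here, since dp is never
         	 * declared in this function and names such as BA_METAONLY,
         	 * b_xflags and BX_ALTDATA are FreeBSD's rather than NetBSD's.
         	 */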
    611 	if (flags & IO_EXT) {
    612 		if (lbn >= NXADDR)
    613 			return (EFBIG);
    614 		/*
    615 		 * If the next write will extend the data into a new block,
     616 		 * and the data is currently composed of a fragment,
    617 		 * this fragment has to be extended to be a full block.
    618 		 */
    619 		lastlbn = lblkno(fs, dp->di_extsize);
    620 		if (lastlbn < lbn) {
    621 			nb = lastlbn;
    622 			osize = sblksize(fs, dp->di_extsize, nb);
    623 			if (osize < fs->fs_bsize && osize > 0) {
    624 				mutex_enter(&ump->um_lock);
    625 				error = ffs_realloccg(ip, -1 - nb,
    626 				    dp->di_extb[nb],
    627 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
    628 					flags, &dp->di_extb[0]),
    629 				    osize,
    630 				    (int)fs->fs_bsize, cred, &bp);
    631 				if (error)
    632 					return (error);
    633 				if (DOINGSOFTDEP(vp))
    634 					softdep_setup_allocext(ip, nb,
    635 					    dbtofsb(fs, bp->b_blkno),
    636 					    dp->di_extb[nb],
    637 					    fs->fs_bsize, osize, bp);
    638 				dp->di_extsize = smalllblktosize(fs, nb + 1);
    639 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
    640 				bp->b_xflags |= BX_ALTDATA;
    641 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
    642 				if (flags & IO_SYNC)
    643 					bwrite(bp);
    644 				else
    645 					bawrite(bp);
    646 			}
    647 		}
    648 		/*
    649 		 * All blocks are direct blocks
    650 		 */
    651 		if (flags & BA_METAONLY)
    652 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
    653 		nb = dp->di_extb[lbn];
    654 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
    655 			error = bread(vp, -1 - lbn, fs->fs_bsize,
    656 			    NOCRED, 0, &bp);
    657 			if (error) {
    658 				brelse(bp, 0);
    659 				return (error);
    660 			}
    661 			mutex_enter(&bp->b_interlock);
    662 			bp->b_blkno = fsbtodb(fs, nb);
    663 			bp->b_xflags |= BX_ALTDATA;
    664 			mutex_exit(&bp->b_interlock);
    665 			*bpp = bp;
    666 			return (0);
    667 		}
    668 		if (nb != 0) {
    669 			/*
    670 			 * Consider need to reallocate a fragment.
    671 			 */
    672 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
    673 			nsize = fragroundup(fs, size);
    674 			if (nsize <= osize) {
    675 				error = bread(vp, -1 - lbn, osize,
    676 				    NOCRED, 0, &bp);
    677 				if (error) {
    678 					brelse(bp, 0);
    679 					return (error);
    680 				}
    681 				mutex_enter(&bp->b_interlock);
    682 				bp->b_blkno = fsbtodb(fs, nb);
    683 				bp->b_xflags |= BX_ALTDATA;
    684 				mutex_exit(&bp->b_interlock);
    685 			} else {
    686 				mutex_enter(&ump->um_lock);
    687 				error = ffs_realloccg(ip, -1 - lbn,
    688 				    dp->di_extb[lbn],
    689 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
    690 				        &dp->di_extb[0]),
    691 				    osize, nsize, cred, &bp);
    692 				if (error)
    693 					return (error);
    694 				bp->b_xflags |= BX_ALTDATA;
    695 				if (DOINGSOFTDEP(vp))
    696 					softdep_setup_allocext(ip, lbn,
    697 					    dbtofsb(fs, bp->b_blkno), nb,
    698 					    nsize, osize, bp);
    699 			}
    700 		} else {
    701 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
    702 				nsize = fragroundup(fs, size);
    703 			else
    704 				nsize = fs->fs_bsize;
    705 			mutex_enter(&ump->um_lock);
    706 			error = ffs_alloc(ip, lbn,
    707 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
    708 			       &dp->di_extb[0]),
    709 			   nsize, flags, cred, &newb);
    710 			if (error)
    711 				return (error);
    712 			error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb),
    713 			    nsize, (flags & BA_CLRBUF) != 0, &bp);
    714 			if (error)
    715 				return error;
    716 			bp->b_xflags |= BX_ALTDATA;
    717 			if (DOINGSOFTDEP(vp))
    718 				softdep_setup_allocext(ip, lbn, newb, 0,
    719 				    nsize, 0, bp);
    720 		}
    721 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
    722 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    723 		*bpp = bp;
    724 		return (0);
    725 	}
    726 #endif
    727 	/*
    728 	 * If the next write will extend the file into a new block,
     729 	 * and the file is currently composed of a fragment,
    730 	 * this fragment has to be extended to be a full block.
    731 	 */
    732 
    733 	lastlbn = lblkno(fs, ip->i_size);
    734 	if (lastlbn < NDADDR && lastlbn < lbn) {
    735 		nb = lastlbn;
    736 		osize = blksize(fs, ip, nb);
    737 		if (osize < fs->fs_bsize && osize > 0) {
    738 			mutex_enter(&ump->um_lock);
    739 			error = ffs_realloccg(ip, nb,
    740 				    ffs_blkpref_ufs2(ip, lastlbn, nb, flags,
    741 					&ip->i_ffs2_db[0]),
    742 				    osize, (int)fs->fs_bsize, cred, bpp, &newb);
    743 			if (error)
    744 				return (error);
    745 			if (DOINGSOFTDEP(vp))
    746 				softdep_setup_allocdirect(ip, nb, newb,
    747 				    ufs_rw64(ip->i_ffs2_db[nb], needswap),
    748 				    fs->fs_bsize, osize, bpp ? *bpp : NULL);
    749 			ip->i_size = lblktosize(fs, nb + 1);
    750 			ip->i_ffs2_size = ip->i_size;
    751 			uvm_vnp_setsize(vp, ip->i_size);
    752 			ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap);
    753 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
     754 			if (bpp && *bpp) {
    755 				if (flags & B_SYNC)
    756 					bwrite(*bpp);
    757 				else
    758 					bawrite(*bpp);
    759 			}
    760 		}
    761 	}
    762 
    763 	/*
    764 	 * The first NDADDR blocks are direct blocks
    765 	 */
    766 
    767 	if (lbn < NDADDR) {
    768 		nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap);
    769 		if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) {
    770 
    771 			/*
    772 			 * The block is an already-allocated direct block
    773 			 * and the file already extends past this block,
    774 			 * thus this must be a whole block.
    775 			 * Just read the block (if requested).
    776 			 */
    777 
    778 			if (bpp != NULL) {
    779 				error = bread(vp, lbn, fs->fs_bsize, NOCRED,
    780 					      B_MODIFY, bpp);
    781 				if (error) {
    782 					brelse(*bpp, 0);
    783 					return (error);
    784 				}
    785 			}
    786 			return (0);
    787 		}
    788 		if (nb != 0) {
    789 
    790 			/*
    791 			 * Consider need to reallocate a fragment.
    792 			 */
    793 
    794 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
    795 			nsize = fragroundup(fs, size);
    796 			if (nsize <= osize) {
    797 
    798 				/*
    799 				 * The existing block is already
    800 				 * at least as big as we want.
    801 				 * Just read the block (if requested).
    802 				 */
    803 
    804 				if (bpp != NULL) {
    805 					error = bread(vp, lbn, osize, NOCRED,
    806 						      B_MODIFY, bpp);
    807 					if (error) {
    808 						brelse(*bpp, 0);
    809 						return (error);
    810 					}
    811 				}
    812 				return 0;
    813 			} else {
    814 
    815 				/*
    816 				 * The existing block is smaller than we want,
    817 				 * grow it.
    818 				 */
    819 				mutex_enter(&ump->um_lock);
    820 				error = ffs_realloccg(ip, lbn,
    821 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
    822 					&ip->i_ffs2_db[0]),
    823 				    osize, nsize, cred, bpp, &newb);
    824 				if (error)
    825 					return (error);
    826 				if (DOINGSOFTDEP(vp))
    827 					softdep_setup_allocdirect(ip, lbn,
    828 					    newb, nb, nsize, osize,
    829 					    bpp ? *bpp : NULL);
    830 			}
    831 		} else {
    832 
    833 			/*
     834 			 * The block was not previously allocated;
     835 			 * allocate a new block or fragment.
    836 			 */
    837 
    838 			if (ip->i_size < lblktosize(fs, lbn + 1))
    839 				nsize = fragroundup(fs, size);
    840 			else
    841 				nsize = fs->fs_bsize;
    842 			mutex_enter(&ump->um_lock);
    843 			error = ffs_alloc(ip, lbn,
    844 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
    845 				&ip->i_ffs2_db[0]),
    846 			    nsize, flags, cred, &newb);
    847 			if (error)
    848 				return (error);
    849 			if (bpp != NULL) {
    850 				error = ffs_getblk(vp, lbn, fsbtodb(fs, newb),
    851 				    nsize, (flags & B_CLRBUF) != 0, bpp);
    852 				if (error)
    853 					return error;
    854 			}
    855 			if (DOINGSOFTDEP(vp)) {
    856 				softdep_setup_allocdirect(ip, lbn, newb, 0,
    857 				    nsize, 0, bpp ? *bpp : NULL);
    858 			}
    859 		}
    860 		ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap);
    861 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    862 		return (0);
    863 	}
    864 
    865 	/*
    866 	 * Determine the number of levels of indirection.
    867 	 */
    868 
    869 	pref = 0;
    870 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
    871 		return (error);
    872 
    873 	/*
     874 	 * Fetch the first indirect block, allocating if necessary.
    875 	 */
    876 
    877 	--num;
    878 	nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap);
    879 	allocib = NULL;
    880 	allocblk = allociblk;
    881 	if (nb == 0) {
    882 		mutex_enter(&ump->um_lock);
    883 		pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL);
    884 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
    885 		    flags | B_METAONLY, cred, &newb);
    886 		if (error)
    887 			goto fail;
    888 		nb = newb;
    889 		*allocblk++ = nb;
    890 		error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb),
    891 		    fs->fs_bsize, true, &bp);
    892 		if (error)
    893 			goto fail;
    894 		if (DOINGSOFTDEP(vp)) {
    895 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
    896 			    newb, 0, fs->fs_bsize, 0, bp);
    897 			bdwrite(bp);
    898 		} else {
    899 
    900 			/*
    901 			 * Write synchronously so that indirect blocks
    902 			 * never point at garbage.
    903 			 */
    904 
    905 			if ((error = bwrite(bp)) != 0)
    906 				goto fail;
    907 		}
    908 		unwindidx = 0;
    909 		allocib = &ip->i_ffs2_ib[indirs[0].in_off];
    910 		*allocib = ufs_rw64(nb, needswap);
    911 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
    912 	}
    913 
    914 	/*
    915 	 * Fetch through the indirect blocks, allocating as necessary.
    916 	 */
    917 
    918 	for (i = 1;;) {
    919 		error = bread(vp,
    920 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp);
    921 		if (error) {
    922 			brelse(bp, 0);
    923 			goto fail;
    924 		}
    925 		bap = (int64_t *)bp->b_data;
    926 		nb = ufs_rw64(bap[indirs[i].in_off], needswap);
    927 		if (i == num)
    928 			break;
    929 		i++;
    930 		if (nb != 0) {
    931 			brelse(bp, 0);
    932 			continue;
    933 		}
    934 		if (fscow_run(bp, true) != 0) {
    935 			brelse(bp, 0);
    936 			goto fail;
    937 		}
    938 		mutex_enter(&ump->um_lock);
    939 		/* Try to keep snapshot indirect blocks contiguous. */
    940 		if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0)
    941 			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i-1].in_off,
    942 			    flags | B_METAONLY, &bap[0]);
    943 		if (pref == 0)
    944 			pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY,
    945 			    NULL);
    946 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
    947 		    flags | B_METAONLY, cred, &newb);
    948 		if (error) {
    949 			brelse(bp, 0);
    950 			goto fail;
    951 		}
    952 		nb = newb;
    953 		*allocblk++ = nb;
    954 		error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb),
    955 		    fs->fs_bsize, true, &nbp);
    956 		if (error) {
    957 			brelse(bp, 0);
    958 			goto fail;
    959 		}
    960 		if (DOINGSOFTDEP(vp)) {
    961 			softdep_setup_allocindir_meta(nbp, ip, bp,
    962 			    indirs[i - 1].in_off, nb);
    963 			bdwrite(nbp);
    964 		} else {
    965 
    966 			/*
    967 			 * Write synchronously so that indirect blocks
    968 			 * never point at garbage.
    969 			 */
    970 
    971 			if ((error = bwrite(nbp)) != 0) {
    972 				brelse(bp, 0);
    973 				goto fail;
    974 			}
    975 		}
    976 		if (unwindidx < 0)
    977 			unwindidx = i - 1;
    978 		bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap);
    979 
    980 		/*
    981 		 * If required, write synchronously, otherwise use
    982 		 * delayed write.
    983 		 */
    984 
    985 		if (flags & B_SYNC) {
    986 			bwrite(bp);
    987 		} else {
    988 			bdwrite(bp);
    989 		}
    990 	}
    991 
    992 	if (flags & B_METAONLY) {
    993 		KASSERT(bpp != NULL);
    994 		*bpp = bp;
    995 		return (0);
    996 	}
    997 
    998 	/*
    999 	 * Get the data block, allocating if necessary.
   1000 	 */
   1001 
   1002 	if (nb == 0) {
   1003 		if (fscow_run(bp, true) != 0) {
   1004 			brelse(bp, 0);
   1005 			goto fail;
   1006 		}
   1007 		mutex_enter(&ump->um_lock);
   1008 		pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags,
   1009 		    &bap[0]);
   1010 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
   1011 		    &newb);
   1012 		if (error) {
   1013 			brelse(bp, 0);
   1014 			goto fail;
   1015 		}
   1016 		nb = newb;
   1017 		*allocblk++ = nb;
   1018 		if (bpp != NULL) {
   1019 			error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
   1020 			    fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp);
   1021 			if (error) {
   1022 				brelse(bp, 0);
   1023 				goto fail;
   1024 			}
   1025 		}
   1026 		if (DOINGSOFTDEP(vp))
   1027 			softdep_setup_allocindir_page(ip, lbn, bp,
   1028 			    indirs[num].in_off, nb, 0, bpp ? *bpp : NULL);
   1029 		bap[indirs[num].in_off] = ufs_rw64(nb, needswap);
   1030 		if (allocib == NULL && unwindidx < 0) {
   1031 			unwindidx = i - 1;
   1032 		}
   1033 
   1034 		/*
   1035 		 * If required, write synchronously, otherwise use
   1036 		 * delayed write.
   1037 		 */
   1038 
   1039 		if (flags & B_SYNC) {
   1040 			bwrite(bp);
   1041 		} else {
   1042 			bdwrite(bp);
   1043 		}
   1044 		return (0);
   1045 	}
   1046 	brelse(bp, 0);
   1047 	if (bpp != NULL) {
   1048 		if (flags & B_CLRBUF) {
   1049 			error = bread(vp, lbn, (int)fs->fs_bsize,
   1050 			    NOCRED, B_MODIFY, &nbp);
   1051 			if (error) {
   1052 				brelse(nbp, 0);
   1053 				goto fail;
   1054 			}
   1055 		} else {
   1056 			error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
   1057 			    fs->fs_bsize, true, &nbp);
   1058 			if (error)
   1059 				goto fail;
   1060 		}
   1061 		*bpp = nbp;
   1062 	}
   1063 	return (0);
   1064 
   1065 fail:
   1066 	/*
   1067 	 * If we have failed part way through block allocation, we
   1068 	 * have to deallocate any indirect blocks that we have allocated.
   1069 	 */
   1070 
   1071 	if (unwindidx >= 0) {
   1072 
   1073 		/*
   1074 		 * First write out any buffers we've created to resolve their
   1075 		 * softdeps.  This must be done in reverse order of creation
   1076 		 * so that we resolve the dependencies in one pass.
   1077 		 * Write the cylinder group buffers for these buffers too.
   1078 		 */
   1079 
   1080 		for (i = num; i >= unwindidx; i--) {
   1081 			if (i == 0) {
   1082 				break;
   1083 			}
   1084 			if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
   1085 			    fs->fs_bsize, false, &bp) != 0)
   1086 				continue;
   1087 			if (bp->b_oflags & BO_DELWRI) {
   1088 				nb = fsbtodb(fs, cgtod(fs, dtog(fs,
   1089 				    dbtofsb(fs, bp->b_blkno))));
   1090 				bwrite(bp);
   1091 				if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK,
   1092 				    fs->fs_cgsize, false, &bp) != 0)
   1093 					continue;
   1094 				if (bp->b_oflags & BO_DELWRI) {
   1095 					bwrite(bp);
   1096 				} else {
   1097 					brelse(bp, BC_INVAL);
   1098 				}
   1099 			} else {
   1100 				brelse(bp, BC_INVAL);
   1101 			}
   1102 		}
   1103 
   1104 		/* Now flush the dependencies to disk. */
   1105 #ifdef notyet
   1106 		/* XXX pages locked */
   1107 		(void)softdep_sync_metadata(vp);
   1108 #endif
   1109 
   1110 		if (DOINGSOFTDEP(vp) && unwindidx == 0) {
   1111 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
   1112 			ffs_update(vp, NULL, NULL, UPDATE_WAIT);
   1113 		}
   1114 
   1115 		/*
   1116 		 * Now that any dependencies that we created have been
   1117 		 * resolved, we can undo the partial allocation.
   1118 		 */
   1119 
   1120 		if (unwindidx == 0) {
   1121 			*allocib = 0;
   1122 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
   1123 			if (DOINGSOFTDEP(vp))
   1124 				ffs_update(vp, NULL, NULL, UPDATE_WAIT);
   1125 		} else {
   1126 			int r;
   1127 
   1128 			r = bread(vp, indirs[unwindidx].in_lbn,
   1129 			    (int)fs->fs_bsize, NOCRED, 0, &bp);
   1130 			if (r) {
   1131 				panic("Could not unwind indirect block, error %d", r);
   1132 				brelse(bp, 0);
   1133 			} else {
   1134 				bap = (int64_t *)bp->b_data;
   1135 				bap[indirs[unwindidx].in_off] = 0;
   1136 				bwrite(bp);
   1137 			}
   1138 		}
   1139 		for (i = unwindidx + 1; i <= num; i++) {
   1140 			if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
   1141 			    fs->fs_bsize, false, &bp) == 0)
   1142 				brelse(bp, BC_INVAL);
   1143 		}
   1144 	}
   1145 	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
   1146 		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
   1147 		deallocated += fs->fs_bsize;
   1148 	}
   1149 	if (deallocated) {
   1150 #ifdef QUOTA
   1151 		/*
   1152 		 * Restore user's disk quota because allocation failed.
   1153 		 */
   1154 		(void)chkdq(ip, -btodb(deallocated), cred, FORCE);
   1155 #endif
   1156 		ip->i_ffs2_blocks -= btodb(deallocated);
   1157 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
   1158 	}
   1159 
   1160 	/*
   1161 	 * Flush all dependencies again so that the soft updates code
   1162 	 * doesn't find any untracked changes.
   1163 	 */
   1164 #ifdef notyet
   1165 	/* XXX pages locked */
   1166 	(void)softdep_sync_metadata(vp);
   1167 #endif
   1168 	return (error);
   1169 }
   1170