Home | History | Annotate | Line # | Download | only in lfs
      1 /*	$NetBSD: lfs_rfw.c,v 1.40 2025/10/20 04:20:37 perseant Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2025 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.40 2025/10/20 04:20:37 perseant Exp $");
     34 
     35 #if defined(_KERNEL_OPT)
     36 #include "opt_quota.h"
     37 #endif
     38 
     39 #include <sys/param.h>
     40 #include <sys/systm.h>
     41 #include <sys/namei.h>
     42 #include <sys/proc.h>
     43 #include <sys/kernel.h>
     44 #include <sys/vnode.h>
     45 #include <sys/mount.h>
     46 #include <sys/kthread.h>
     47 #include <sys/buf.h>
     48 #include <sys/device.h>
     49 #include <sys/file.h>
     50 #include <sys/disklabel.h>
     51 #include <sys/ioctl.h>
     52 #include <sys/errno.h>
     53 #include <sys/malloc.h>
     54 #include <sys/pool.h>
     55 #include <sys/socket.h>
     56 #include <sys/stat.h>
     57 #include <sys/syslog.h>
     58 #include <sys/sysctl.h>
     59 #include <sys/conf.h>
     60 #include <sys/kauth.h>
     61 
     62 #include <miscfs/specfs/specdev.h>
     63 
     64 #include <ufs/lfs/ulfs_quotacommon.h>
     65 #include <ufs/lfs/ulfs_inode.h>
     66 #include <ufs/lfs/ulfsmount.h>
     67 #include <ufs/lfs/ulfs_extern.h>
     68 
     69 #include <uvm/uvm_extern.h>
     70 
     71 #include <ufs/lfs/lfs.h>
     72 #include <ufs/lfs/lfs_accessors.h>
     73 #include <ufs/lfs/lfs_kernel.h>
     74 #include <ufs/lfs/lfs_extern.h>
     75 
     76 #include <miscfs/genfs/genfs.h>
     77 #include <miscfs/genfs/genfs_node.h>
     78 
     79 /*
     80  * Roll-forward code.
     81  */
     82 static bool all_selector(void *, struct vnode *);
     83 static void drop_vnode_pages(struct mount *, struct lwp *);
     84 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *,
     85 				      const union lfs_dinode *);
     86 static int update_inogen(struct lfs_inofuncarg *);
     87 static int update_inoblk(struct lfs_inofuncarg *);
     88 static int ino_func_setclean(struct lfs_inofuncarg *);
     89 static int finfo_func_rfw(struct lfs_finfofuncarg *);
     90 static int finfo_func_rewrite(struct lfs_finfofuncarg *);
     91 static int finfo_func_setclean(struct lfs_finfofuncarg *);
     92 
     93 static int update_meta(struct lfs *, ino_t, int, daddr_t, daddr_t, size_t,
     94 		       struct lwp *l);
     95 static int skip_superblock(struct lfs *, daddr_t *);
     96 static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t, size_t, int *);
     97 #if 0
     98 static bool lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2);
     99 #endif
    100 
    101 extern int lfs_do_rfw;
    102 int rblkcnt;
    103 int lfs_rfw_max_psegs = 0;
    104 
    105 /*
    106  * Allocate a particular inode with a particular version number, freeing
    107  * any previous versions of this inode that may have gone before.
    108  * Used by the roll-forward code.
    109  *
    110  * XXX this function does not have appropriate locking to be used on a live fs;
    111  * XXX but something similar could probably be used for an "undelete" call.
    112  *
    113  * Called with the Ifile inode locked.
    114  */
    115 int
    116 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
    117 	      struct vnode **vpp, union lfs_dinode *dip)
    118 {
    119 	struct vattr va;
    120 	struct vnode *vp;
    121 	struct inode *ip;
    122 	int error;
    123 
    124 	KASSERT(ino > LFS_IFILE_INUM);
    125 	ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
    126 
    127 	/*
    128 	 * First, just try a vget. If the version number is the one we want,
    129 	 * we don't have to do anything else.  If the version number is wrong,
    130 	 * take appropriate action.
    131 	 */
    132 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
    133 	if (error == 0) {
    134 		DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n",
    135 			(int)ino, vp));
    136 
    137 		*vpp = vp;
    138 		ip = VTOI(vp);
    139 		DLOG((DLOG_RF, "  ip->i_gen=%jd dip nlink %jd seeking"
    140 			" version %jd\n", (intmax_t)ip->i_gen,
    141 			(intmax_t)(dip == NULL ? -1
    142 				: lfs_dino_getnlink(fs, dip)), (intmax_t)vers));
    143 		if (ip->i_gen == vers) {
    144 			/*
    145 			 * We have what we wanted already.
    146 			 */
    147 			DLOG((DLOG_RF, "  pre-existing\n"));
    148 			return 0;
    149 		} else if (ip->i_gen < vers && dip != NULL
    150 			&& lfs_dino_getnlink(fs, dip) > 0) {
    151 			/*
    152 			 * We have found a newer version.  Truncate
    153 			 * the old vnode to zero and re-initialize
    154 			 * from the given dinode.
    155 			 */
    156 			DLOG((DLOG_RF, "  replace old version %jd\n",
    157 				(intmax_t)ip->i_gen));
    158 			lfs_truncate(vp, (off_t)0, 0, NOCRED);
    159 			ip->i_gen = vers;
    160 			vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip));
    161 			update_inoblk_copy_dinode(fs, ip->i_din, dip);
    162 			LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
    163 			return 0;
    164 		} else {
    165 			/*
    166 			 * Not the right version and nothing to
    167 			 * initialize from.  Don't recover this data.
    168 			 */
    169 			DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
    170 				(int)ino, (int)vers,
    171 				(int)lfs_dino_getgen(fs, ip->i_din)));
    172 			vput(vp);
    173 			*vpp = NULLVP;
    174 			return EEXIST;
    175 		}
    176 	}
    177 
    178 	/*
    179 	 * No version of this inode was found in the cache.
    180 	 * Make a new one from the dinode.  We will add data blocks
    181 	 * as they come in, so scrub any block addresses off of the
    182 	 * inode and reset block counts to zero.
    183 	 */
    184 	if (dip == NULL)
    185 		return ENOENT;
    186 
    187 	vattr_null(&va);
    188 	va.va_type = IFTOVT(lfs_dino_getmode(fs, dip));
    189 	va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS;
    190 	va.va_fileid = ino;
    191 	va.va_gen = vers;
    192 	error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL,
    193 	    &vp);
    194 	if (error)
    195 		return error;
    196 	error = vn_lock(vp, LK_EXCLUSIVE);
    197 	if (error)
    198 		goto err;
    199 
    200 	ip = VTOI(vp);
    201 	update_inoblk_copy_dinode(fs, ip->i_din, dip);
    202 
    203 	DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d,"
    204 		" blocks=%d\n", (int)ino, vp, (long long)ip->i_size,
    205 		(int)ip->i_lfs_effnblks,
    206 		(int)lfs_dino_getblocks(fs, ip->i_din)));
    207 	*vpp = vp;
    208 	return 0;
    209 
    210 err:
    211 	vrele(vp);
    212 	*vpp = NULLVP;
    213 	return error;
    214 }
    215 
    216 /*
    217  * Load the appropriate indirect block, and change the appropriate pointer.
    218  * Mark the block dirty.  Do segment and avail accounting.
    219  */
    220 static int
    221 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
    222 	    daddr_t ndaddr, size_t size, struct lwp *l)
    223 {
    224 	int error;
    225 	struct vnode *vp;
    226 	struct inode *ip;
    227 	daddr_t odaddr;
    228 	struct indir a[ULFS_NIADDR];
    229 	int num;
    230 	struct buf *bp;
    231 	SEGUSE *sup;
    232 	u_int64_t newsize, loff;
    233 
    234 	KASSERT(lbn >= 0);	/* no indirect blocks */
    235 	KASSERT(ino > LFS_IFILE_INUM);
    236 
    237 	DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n",
    238 	      (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr));
    239 
    240 	if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0)
    241 		return error;
    242 	ip = VTOI(vp);
    243 
    244 	/*
    245 	 * If block already exists, note its new location
    246 	 * but do not account it as new.
    247 	 */
    248 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
    249 	if (odaddr == UNASSIGNED) {
    250 		if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)),
    251 					size, NOCRED, 0, &bp)) != 0) {
    252 			vput(vp);
    253 			return (error);
    254 		}
    255 		/* No need to write, the block is already on disk */
    256 		if (bp->b_oflags & BO_DELWRI) {
    257 			LFS_UNLOCK_BUF(bp);
    258 			/* Account recovery of the previous version */
    259 			lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
    260 		}
    261 		brelse(bp, BC_INVAL);
    262 		DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d,"
    263 			" lfs_dino_getblocks(fs, ip->i_din) = %d\n",
    264 			(int)ip->i_lfs_effnblks,
    265 			(int)lfs_dino_getblocks(fs, ip->i_din)));
    266 	} else {
    267 		/* XXX fragextend? */
    268 		DLOG((DLOG_RF, "block exists, no balloc\n"));
    269 	}
    270 
    271 	/*
    272 	 * Extend the file, if it is not large enough already.
    273 	 * XXX This is not exactly right, we don't know how much of the
    274 	 * XXX last block is actually used.
    275 	 *
    276 	 * XXX We should be able to encode the actual data length of the
    277 	 * XXX last block in fi_lastlength, since we can infer the
    278 	 * XXX necessary block length from that using a variant of
    279 	 * XXX lfs_blksize().
    280 	 */
    281 	loff = lfs_lblktosize(fs, lbn);
    282 	if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) {
    283 		/* No fragments */
    284 		newsize = loff + 1;
    285 	} else {
    286 		/* Subtract only a fragment to account for block size */
    287 		newsize = loff + size - lfs_fsbtob(fs, 1) + 1;
    288 	}
    289 
    290 	if (ip->i_size < newsize) {
    291 		DLOG((DLOG_RF, "ino %d size %d -> %d\n",
    292 		      (int)ino, (int)ip->i_size, (int)newsize));
    293 		lfs_dino_setsize(fs, ip->i_din, newsize);
    294 		ip->i_size = newsize;
    295 		/*
    296 		 * tell vm our new size for the case the inode won't
    297 		 * appear later.
    298 		 */
    299 		uvm_vnp_setsize(vp, newsize);
    300 	}
    301 
    302 	lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
    303 
    304 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
    305 	sup->su_nbytes += size;
    306 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
    307 
    308 	/* differences here should be due to UNWRITTEN indirect blocks. */
    309 	if (vp->v_type != VLNK) {
    310 		if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din))
    311 #if 0
    312 		    || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
    313 			 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din))
    314 #endif /* 0 */
    315 			) {
    316 			vprint("vnode", vp);
    317 			printf("effnblks=%jd dino_getblocks=%jd\n",
    318 			       (intmax_t)ip->i_lfs_effnblks,
    319 			       (intmax_t)lfs_dino_getblocks(fs, ip->i_din));
    320 		}
    321 		KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
    322 #if 0
    323 		KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
    324 			ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din));
    325 #endif /* 0 */
    326 	}
    327 
    328 #ifdef DEBUG
    329 	/* Now look again to make sure it worked */
    330 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
    331 	if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
    332 		DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd"
    333 		      " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr));
    334 #endif /* DEBUG */
    335 	vput(vp);
    336 	return 0;
    337 }
    338 
    339 /*
    340  * Copy some the fields of the dinode as needed by update_inoblk().
    341  */
    342 static void
    343 update_inoblk_copy_dinode(struct lfs *fs,
    344     union lfs_dinode *dstu, const union lfs_dinode *srcu)
    345 {
    346 	if (fs->lfs_is64) {
    347 		struct lfs64_dinode *dst = &dstu->u_64;
    348 		const struct lfs64_dinode *src = &srcu->u_64;
    349 		unsigned i;
    350 
    351 		/*
    352 		 * Copy everything but the block pointers and di_blocks.
    353 		 * XXX what about di_extb?
    354 		 */
    355 		dst->di_mode = src->di_mode;
    356 		dst->di_nlink = src->di_nlink;
    357 		dst->di_uid = src->di_uid;
    358 		dst->di_gid = src->di_gid;
    359 		dst->di_blksize = src->di_blksize;
    360 		dst->di_size = src->di_size;
    361 		dst->di_atime = src->di_atime;
    362 		dst->di_mtime = src->di_mtime;
    363 		dst->di_ctime = src->di_ctime;
    364 		dst->di_birthtime = src->di_birthtime;
    365 		dst->di_mtimensec = src->di_mtimensec;
    366 		dst->di_atimensec = src->di_atimensec;
    367 		dst->di_ctimensec = src->di_ctimensec;
    368 		dst->di_birthnsec = src->di_birthnsec;
    369 		dst->di_gen = src->di_gen;
    370 		dst->di_kernflags = src->di_kernflags;
    371 		dst->di_flags = src->di_flags;
    372 		dst->di_extsize = src->di_extsize;
    373 		dst->di_modrev = src->di_modrev;
    374 		dst->di_inumber = src->di_inumber;
    375 		for (i = 0; i < __arraycount(src->di_spare); i++) {
    376 			dst->di_spare[i] = src->di_spare[i];
    377 		}
    378 		/* Short symlinks store their data in di_db. */
    379 		if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
    380 		    && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
    381 			memcpy(dst->di_db, src->di_db, src->di_size);
    382 		}
    383 	} else {
    384 		struct lfs32_dinode *dst = &dstu->u_32;
    385 		const struct lfs32_dinode *src = &srcu->u_32;
    386 
    387 		/* Get mode, link count, size, and times */
    388 		memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
    389 
    390 		/* Then the rest, except di_blocks */
    391 		dst->di_flags = src->di_flags;
    392 		dst->di_gen = src->di_gen;
    393 		dst->di_uid = src->di_uid;
    394 		dst->di_gid = src->di_gid;
    395 		dst->di_modrev = src->di_modrev;
    396 
    397 		/* Short symlinks store their data in di_db. */
    398 		if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
    399 		    && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
    400 			memcpy(dst->di_db, src->di_db, src->di_size);
    401 		}
    402 	}
    403 }
    404 
    405 static int
    406 update_inoblk(struct lfs_inofuncarg *lifa)
    407 {
    408 	struct lfs *fs;
    409 	daddr_t offset, daddr;
    410 	struct lwp *l;
    411 	struct vnode *devvp, *vp;
    412 	struct inode *ip;
    413 	union lfs_dinode *dip;
    414 	struct buf *dbp, *ibp;
    415 	int error;
    416 	IFILE *ifp;
    417 	SEGUSE *sup;
    418 	unsigned i, num;
    419 	uint32_t gen, osn, nsn;
    420 	char *buf;
    421 
    422 	fs = lifa->fs;
    423 	offset = lifa->offset;
    424 	l = lifa->l;
    425 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    426 
    427 	/*
    428 	 * Get the inode, update times and perms.
    429 	 * DO NOT update disk blocks, we do that separately.
    430 	 */
    431 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    432 	    0, &dbp);
    433 	if (error) {
    434 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
    435 		return error;
    436 	}
    437 	buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK);
    438 	memcpy(buf, dbp->b_data, dbp->b_bcount);
    439 	brelse(dbp, BC_AGE);
    440 	num = LFS_INOPB(fs);
    441 	for (i = num; i-- > 0; ) {
    442 		dip = DINO_IN_BLOCK(fs, buf, i);
    443 		if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM)
    444 			continue;
    445 
    446 		/* Check generation number */
    447 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    448 		gen = lfs_if_getversion(fs, ifp);
    449 		brelse(ibp, 0);
    450 		if (lfs_dino_getgen(fs, dip) < gen) {
    451 			continue;
    452 		}
    453 
    454 		/*
    455 		 * This inode is the newest generation.  Load it.
    456 		 */
    457 		error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip),
    458 				      lfs_dino_getgen(fs, dip),
    459 				      l, &vp, dip);
    460 		if (error) {
    461 			DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
    462 			      " returned %d\n", error));
    463 			continue;
    464 		}
    465 		ip = VTOI(vp);
    466 		if (lfs_dino_getsize(fs, dip) != ip->i_size
    467 		    && vp->v_type != VLNK) {
    468 			/* XXX What should we do with symlinks? */
    469 			DLOG((DLOG_RF, "  ino %jd size %jd -> %jd\n",
    470 				(intmax_t)lfs_dino_getinumber(fs, dip),
    471 				(intmax_t)ip->i_size,
    472 				(intmax_t)lfs_dino_getsize(fs, dip)));
    473 			lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
    474 				     NOCRED);
    475 		}
    476 		update_inoblk_copy_dinode(fs, ip->i_din, dip);
    477 
    478 		ip->i_flags = lfs_dino_getflags(fs, dip);
    479 		ip->i_gen = lfs_dino_getgen(fs, dip);
    480 		ip->i_uid = lfs_dino_getuid(fs, dip);
    481 		ip->i_gid = lfs_dino_getgid(fs, dip);
    482 
    483 		ip->i_mode = lfs_dino_getmode(fs, dip);
    484 		ip->i_nlink = lfs_dino_getnlink(fs, dip);
    485 		ip->i_size = lfs_dino_getsize(fs, dip);
    486 
    487 		LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
    488 
    489 		/* Re-initialize to get type right */
    490 		ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
    491 			  &vp);
    492 
    493 		/* Record change in location */
    494 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    495 		daddr = lfs_if_getdaddr(fs, ifp);
    496 		lfs_if_setdaddr(fs, ifp, offset);
    497 		error = LFS_BWRITE_LOG(ibp); /* Ifile */
    498 		/* And do segment accounting */
    499 		osn = lfs_dtosn(fs, daddr);
    500 		nsn = lfs_dtosn(fs, offset);
    501 		if (DADDR_IS_BAD(daddr) || osn != nsn) {
    502 			if (!DADDR_IS_BAD(daddr)) {
    503 				LFS_SEGENTRY(sup, fs, osn, ibp);
    504 				sup->su_nbytes -= DINOSIZE(fs);
    505 				LFS_WRITESEGENTRY(sup, fs, osn, ibp);
    506 			}
    507 			LFS_SEGENTRY(sup, fs, nsn, ibp);
    508 			sup->su_nbytes += DINOSIZE(fs);
    509 			LFS_WRITESEGENTRY(sup, fs, nsn, ibp);
    510 		}
    511 		vput(vp);
    512 	}
    513 	free(buf, M_SEGMENT);
    514 
    515 	return 0;
    516 }
    517 
    518 /*
    519  * Note the highest generation number of each inode in the Ifile.
    520  * This allows us to skip processing data for intermediate versions.
    521  */
    522 static int
    523 update_inogen(struct lfs_inofuncarg *lifa)
    524 {
    525 	struct lfs *fs;
    526 	daddr_t offset;
    527 	struct vnode *devvp;
    528 	union lfs_dinode *dip;
    529 	struct buf *dbp, *ibp;
    530 	int error;
    531 	IFILE *ifp;
    532 	unsigned i, num;
    533 
    534 	fs = lifa->fs;
    535 	offset = lifa->offset;
    536 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    537 
    538 	/* Read inode block */
    539 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    540 	    0, &dbp);
    541 	if (error) {
    542 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
    543 		return error;
    544 	}
    545 
    546 	/* Check each inode against ifile entry */
    547 	num = LFS_INOPB(fs);
    548 	for (i = num; i-- > 0; ) {
    549 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
    550 		if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM)
    551 			continue;
    552 
    553 		/* Update generation number */
    554 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    555 		if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip))
    556 			lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip));
    557 		error = LFS_BWRITE_LOG(ibp); /* Ifile */
    558 		if (error)
    559 			break;
    560 	}
    561 	brelse(dbp, 0);
    562 
    563 	return error;
    564 }
    565 
    566 static int
    567 finfo_func_rfw(struct lfs_finfofuncarg *lffa)
    568 {
    569 	struct lfs *fs;
    570 	FINFO *fip;
    571 	daddr_t *offsetp;
    572 	struct lwp *l;
    573 	int j;
    574 	size_t size;
    575 
    576 	fs = lffa->fs;
    577 	fip = lffa->finfop;
    578 	offsetp = lffa->offsetp;
    579 	l = lffa->l;
    580 	size = lfs_sb_getbsize(fs);
    581 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    582 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
    583 			size = lfs_fi_getlastlength(fs, fip);
    584 
    585 		/* Account for and update any direct blocks */
    586 		if (lfs_fi_getino(fs, fip) > LFS_IFILE_INUM &&
    587 		    lfs_fi_getblock(fs, fip, j) >= 0) {
    588 			update_meta(fs, lfs_fi_getino(fs, fip),
    589 				    lfs_fi_getversion(fs, fip),
    590 				    lfs_fi_getblock(fs, fip, j),
    591 				    *offsetp, size, l);
    592 			++rblkcnt;
    593 		}
    594 		*offsetp += lfs_btofsb(fs, size);
    595 	}
    596 
    597 	return 0;
    598 }
    599 
    600 static int
    601 skip_superblock(struct lfs *fs, daddr_t *offsetp)
    602 {
    603 	daddr_t offset;
    604 	int i;
    605 
    606 	/*
    607 	 * If this is segment 0, skip the label.
    608 	 * If the segment has a superblock and we're at the top
    609 	 * of the segment, skip the superblock.
    610 	 */
    611 	offset = *offsetp;
    612 	if (offset == lfs_sb_gets0addr(fs)) {
    613 		offset += lfs_btofsb(fs, LFS_LABELPAD);
    614 	}
    615 	for (i = 0; i < LFS_MAXNUMSB; i++) {
    616 		if (offset == lfs_sb_getsboff(fs, i)) {
    617 			offset += lfs_btofsb(fs, LFS_SBPAD);
    618 			break;
    619 		}
    620 	}
    621 	*offsetp = offset;
    622 	return 0;
    623 }
    624 
    625 /*
    626  * Read the partial sement at offset.
    627  *
    628  * If finfo_func and ino_func are both NULL, check the summary
    629  * and data checksums.  During roll forward, this must be done in its
    630  * entirety before processing any blocks.
    631  *
    632  * If finfo_func is given, use that to process every file block
    633  * in the segment summary.  If ino_func is given, use that to process
    634  * every inode block.
    635  */
    636 #define CKSEG_NONE  0x0000
    637 #define CKSEG_CKSUM 0x0001
    638 #define CKSEG_AVAIL 0x0002
    639 
    640 int
    641 lfs_parse_pseg(struct lfs *fs, daddr_t *offsetp, u_int64_t nextserial,
    642 	       kauth_cred_t cred, int *pseg_flags, struct lwp *l,
    643 	       int (*ino_func)(struct lfs_inofuncarg *),
    644 	       int (*finfo_func)(struct lfs_finfofuncarg *),
    645 	       int flags, void *arg)
    646 {
    647 	struct vnode *devvp;
    648 	struct buf *bp, *dbp;
    649 	int error, ninos, i, j;
    650 	SEGSUM *ssp;
    651 	daddr_t offset, prevoffset;
    652 	IINFO *iip;
    653 	FINFO *fip;
    654 	size_t size;
    655 	uint32_t datasum, foundsum;
    656 	char *buf;
    657 	struct lfs_inofuncarg lifa;
    658 	struct lfs_finfofuncarg lffa;
    659 
    660 	KASSERT(fs != NULL);
    661 	KASSERT(offsetp != NULL);
    662 
    663 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    664 
    665 	/* Set up callback arguments */
    666 	lifa.fs = fs;
    667 	/* lifa.offset = offset; */
    668 	lifa.cred = cred;
    669 	lifa.l = l;
    670 	lifa.buf = malloc(lfs_sb_getbsize(fs), M_SEGMENT, M_WAITOK);
    671 
    672 	lifa.arg = arg;
    673 
    674 	lffa.fs = fs;
    675 	/* lffa.offsetp = offsetp; */
    676 	/* lffa.finfop = finfop; */
    677 	lffa.cred = cred;
    678 	lffa.l = l;
    679 	lffa.arg = arg;
    680 
    681 	prevoffset = *offsetp;
    682 	skip_superblock(fs, offsetp);
    683 	offset = *offsetp;
    684 
    685 	/* Read in the segment summary */
    686 	buf = malloc(lfs_sb_getsumsize(fs), M_SEGMENT, M_WAITOK);
    687 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
    688 	    0, &bp);
    689 	if (error)
    690 		goto err;
    691 	memcpy(buf, bp->b_data, bp->b_bcount);
    692 	brelse(bp, BC_AGE);
    693 
    694 	ssp = (SEGSUM *)buf;
    695 
    696 	/*
    697 	 * Phase I: Check summary checksum.
    698 	 */
    699 	if (flags & CKSEG_CKSUM) {
    700 		size_t sumstart;
    701 
    702 		if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) {
    703 			DLOG((DLOG_RF, "Bad magic at 0x%" PRIx64 "\n",
    704 			      offset));
    705 			offset = -1;
    706 			goto err;
    707 		}
    708 
    709 		sumstart = lfs_ss_getsumstart(fs);
    710 		if (lfs_ss_getsumsum(fs, ssp) !=
    711 		    cksum((char *)ssp + sumstart,
    712 			  lfs_sb_getsumsize(fs) - sumstart)) {
    713 			DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n",
    714 				offset));
    715 			offset = -1;
    716 			goto err;
    717 		}
    718 #if 0
    719 		/*
    720 		 * Under normal conditions, we should never be producing
    721 		 * a partial segment with neither inode blocks nor data blocks.
    722 		 * However, these do sometimes appear and they need not
    723 		 * prevent us from continuing.
    724 		 */
    725 		if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
    726 		    lfs_ss_getninos(fs, ssp) == 0) {
    727 			DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n",
    728 				offset));
    729 			offset = -1;
    730 			goto err;
    731 		}
    732 #endif /* 0 */
    733 		if (lfs_sb_getversion(fs) == 1) {
    734 			if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
    735 				DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
    736 				offset = -1;
    737 				goto err;
    738 			}
    739 		} else {
    740 			if (nextserial > 0
    741 			    && lfs_ss_getserial(fs, ssp) != nextserial) {
    742 				DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx,"
    743 				      " expected 0x%jx\n", (intmax_t)offset,
    744 				      (intmax_t)lfs_ss_getserial(fs, ssp),
    745 				      (intmax_t)nextserial));
    746 				offset = -1;
    747 				goto err;
    748 			}
    749 			if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
    750 				DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
    751 				      PRIx64 "\n", lfs_ss_getident(fs, ssp),
    752 				      lfs_sb_getident(fs), offset));
    753 				offset = -1;
    754 				goto err;
    755 			}
    756 		}
    757 	}
    758 	if (pseg_flags)
    759 		*pseg_flags = lfs_ss_getflags(fs, ssp);
    760 	offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));
    761 
    762 	/* Handle individual blocks */
    763 	foundsum = 0;
    764 	ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
    765 	iip = SEGSUM_IINFOSTART(fs, buf);
    766 	fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf);
    767 	KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getssize(fs) / lfs_sb_getfsize(fs));
    768 	KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getfsize(fs) / sizeof(FINFO32));
    769 	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
    770 		/* Inode block? */
    771 		if (ninos && lfs_ii_getblock(fs, iip) == offset) {
    772 			if (flags & CKSEG_CKSUM) {
    773 				/* Read in the head and add to the buffer */
    774 				error = bread(devvp, LFS_FSBTODB(fs, offset),
    775 					lfs_sb_getbsize(fs), 0, &dbp);
    776 				if (error) {
    777 					offset = -1;
    778 					goto err;
    779 				}
    780 				foundsum = lfs_cksum_part(dbp->b_data,
    781 					sizeof(uint32_t), foundsum);
    782 				brelse(dbp, BC_AGE);
    783 			} else if (ino_func != NULL) {
    784 				lifa.offset = offset;
    785 				error = (*ino_func)(&lifa);
    786 				if (error != 0) {
    787 					offset = -1;
    788 					goto err;
    789 				}
    790 			}
    791 
    792 			offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
    793 			iip = NEXTLOWER_IINFO(fs, iip);
    794 			--ninos;
    795 			--i; /* compensate for ++i in loop header */
    796 			continue;
    797 		}
    798 
    799 		/* File block */
    800 		size = lfs_sb_getbsize(fs);
    801 		if (flags & CKSEG_CKSUM) {
    802 			for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    803 				if (j == lfs_fi_getnblocks(fs, fip) - 1)
    804 					size = lfs_fi_getlastlength(fs, fip);
    805 				error = bread(devvp, LFS_FSBTODB(fs, offset),
    806 					      size, 0, &dbp);
    807 				if (error) {
    808 					offset = -1;
    809 					goto err;
    810 				}
    811 				foundsum = lfs_cksum_part(dbp->b_data,
    812 							  sizeof(uint32_t), foundsum);
    813 				brelse(dbp, BC_AGE);
    814 				offset += lfs_btofsb(fs, size);
    815 			}
    816 		} else if (finfo_func != NULL) {
    817 			lffa.offsetp = &offset;
    818 			lffa.finfop = fip;
    819 			(*finfo_func)(&lffa);
    820 		} else {
    821 			int n = lfs_fi_getnblocks(fs, fip);
    822 			size = lfs_fi_getlastlength(fs, fip);
    823 			offset += lfs_btofsb(fs, lfs_sb_getbsize(fs) * (n - 1)
    824 					     + size);
    825 		}
    826 		fip = NEXT_FINFO(fs, fip);
    827 	}
    828 
    829 	/* Checksum the array, compare */
    830 	if (flags & CKSEG_CKSUM) {
    831 		datasum = lfs_ss_getdatasum(fs, ssp);
    832 		foundsum = lfs_cksum_fold(foundsum);
    833 		if (datasum != foundsum) {
    834 			DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
    835 			      " (wanted %x got %x)\n",
    836 			      offset, datasum, foundsum));
    837 			offset = -1;
    838 			goto err;
    839 		}
    840 	} else {
    841 		/* Don't clog the buffer queue */
    842 		mutex_enter(&lfs_lock);
    843 		if (locked_queue_count > LFS_MAX_BUFS ||
    844 		    locked_queue_bytes > LFS_MAX_BYTES) {
    845 			lfs_flush(fs, SEGM_CKP, 0);
    846 		}
    847 		mutex_exit(&lfs_lock);
    848 	}
    849 
    850 	/*
    851 	 * If we're at the end of the segment, move to the next.
    852 	 * A partial segment needs space for a segment header (1 fsb)
    853 	 * and a full block ("frag" fsb).  Thus, adding "frag" fsb should
    854 	 * still be within the current segment (whereas frag + 1 might
    855 	 * be at the start of the next segment).
    856 	 *
    857 	 * This needs to match the definition of LFS_PARTIAL_FITS
    858 	 * in lfs_segment.c.
    859 	 */
    860 	if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs))
    861 	    != lfs_dtosn(fs, offset)) {
    862 		if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs,
    863 									ssp))) {
    864 			offset = -1;
    865 			goto err;
    866 		}
    867 		offset = lfs_ss_getnext(fs, ssp);
    868 		DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
    869 		       " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
    870 	}
    871 	if (flags & CKSEG_AVAIL)
    872 		lfs_sb_subavail(fs, offset - prevoffset);
    873 
    874     err:
    875 	free(lifa.buf, M_SEGMENT);
    876 	free(buf, M_SEGMENT);
    877 
    878 	*offsetp = offset;
    879 	return 0;
    880 }
    881 
    882 /*
    883  * Roll forward.
    884  */
    885 void
    886 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
    887 {
    888 	int flags, dirty;
    889 	daddr_t startoffset, offset, nextoffset, endpseg;
    890 	u_int64_t nextserial, startserial, endserial;
    891 	int sn, curseg;
    892 	struct proc *p;
    893 	kauth_cred_t cred;
    894 	SEGUSE *sup;
    895 	struct buf *bp;
    896 
    897 	p = l ? l->l_proc : NULL;
    898 	cred = p ? p->p_cred : NOCRED;
    899 
    900 	/*
    901 	 * We don't roll forward for v1 filesystems, because
    902 	 * of the danger that the clock was turned back between the last
    903 	 * checkpoint and crash.  This would roll forward garbage.
    904 	 *
    905 	 * v2 filesystems don't have this problem because they use a
    906 	 * monotonically increasing serial number instead of a timestamp.
    907 	 */
    908 	rblkcnt = 0;
    909 	if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw
    910 	    || lfs_sb_getversion(fs) <= 1 || p == NULL)
    911 		return;
    912 
    913 	DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n",
    914 		lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs)));
    915 	DEBUG_CHECK_FREELIST(fs);
    916 
    917 	/*
    918 	 * Phase I: Find the address of the last good partial
    919 	 * segment that was written after the checkpoint.  Mark
    920 	 * the segments in question dirty, so they won't be
    921 	 * reallocated.
    922 	 */
    923 	endpseg = startoffset = offset = lfs_sb_getoffset(fs);
    924 	flags = 0x0;
    925 	DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
    926 	      PRIx64 "\n", offset));
    927 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    928 	if (!(sup->su_flags & SEGUSE_DIRTY))
    929 		lfs_sb_subnclean(fs, 1);
    930 	sup->su_flags |= SEGUSE_DIRTY;
    931 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    932 
    933 	startserial = lfs_sb_getserial(fs);
    934 	endserial = nextserial = startserial + 1;
    935 	nextoffset = offset;
    936 	while (1) {
    937 		nextoffset = offset;
    938 		lfs_parse_pseg(fs, &nextoffset, nextserial,
    939 			     cred, &flags, l, NULL, NULL, CKSEG_CKSUM, NULL);
    940 		if (nextoffset == -1)
    941 			break;
    942 		if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) {
    943 			LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset),
    944 				     bp);
    945 			if (!(sup->su_flags & SEGUSE_DIRTY))
    946 				lfs_sb_subnclean(fs, 1);
    947 			sup->su_flags |= SEGUSE_DIRTY;
    948 			LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    949 		}
    950 
    951 		DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx"
    952 			" serial=0x%jx\n", (intmax_t)nextoffset,
    953 			(intmax_t)nextserial));
    954 		if (flags & SS_DIROP) {
    955 			DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
    956 			      PRIx64 "\n", offset));
    957 			if (!(flags & SS_CONT)) {
    958 			     DLOG((DLOG_RF, "lfs_mountfs: dirops end "
    959 				   "at 0x%" PRIx64 "\n", offset));
    960 			}
    961 		}
    962 		offset = nextoffset;
    963 		++nextserial;
    964 
    965 		if (!(flags & SS_CONT)) {
    966 			endpseg = nextoffset;
    967 			endserial = nextserial;
    968 		}
    969 		if (lfs_rfw_max_psegs > 0
    970 		    && nextserial > startserial + lfs_rfw_max_psegs)
    971 			break;
    972 	}
    973 	if (flags & SS_CONT) {
    974 		DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
    975 			"dirops discarded (0x%jx < 0x%jx)\n",
    976 			endpseg, nextoffset));
    977 	}
    978 	if (lfs_sb_getversion(fs) > 1)
    979 		lfs_sb_setserial(fs, endserial);
    980 	DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
    981 	      "endpseg=0x%" PRIx64 "\n", endpseg));
    982 	offset = startoffset;
    983 	if (offset != endpseg) {
    984 		/* Don't overwrite what we're trying to preserve */
    985 		lfs_sb_setoffset(fs, endpseg);
    986 		lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg)));
    987 		for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
    988 			sn = (sn + 1) % lfs_sb_getnseg(fs);
    989 			/* XXX could we just fail to roll forward? */
    990 			if (sn == curseg)
    991 				panic("lfs_mountfs: no clean segments");
    992 			LFS_SEGENTRY(sup, fs, sn, bp);
    993 			dirty = (sup->su_flags & SEGUSE_DIRTY);
    994 			brelse(bp, 0);
    995 			if (!dirty)
    996 				break;
    997 		}
    998 		lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
    999 		/* Explicitly set this segment dirty */
   1000 		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
   1001 		sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
   1002 		LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
   1003 
   1004 		/*
   1005 		 * Phase II: Identify the highest generation of each
   1006 		 * inode.  We will ignore inodes and data blocks
   1007 		 * belonging to old versions.
   1008 		 */
   1009 		offset = startoffset;
   1010 		nextserial = startserial + 1;
   1011 		DLOG((DLOG_RF, "LFS roll forward phase 2 beginning\n"));
   1012 		while (offset > 0 && offset != endpseg) {
   1013 			lfs_parse_pseg(fs, &offset, nextserial++, cred,
   1014 				     NULL, l, update_inogen, NULL,
   1015 				     CKSEG_NONE, NULL);
   1016 			DEBUG_CHECK_FREELIST(fs);
   1017 		}
   1018 
   1019 		/*
   1020 		 * Phase III: Update inodes.
   1021 		 */
   1022 		offset = startoffset;
   1023 		nextserial = startserial + 1;
   1024 		DLOG((DLOG_RF, "LFS roll forward phase 3 beginning\n"));
   1025 		while (offset > 0 && offset != endpseg) {
   1026 			lfs_parse_pseg(fs, &offset, nextserial++, cred,
   1027 				     NULL, l, update_inoblk, NULL,
   1028 				     CKSEG_NONE, NULL);
   1029 			DEBUG_CHECK_FREELIST(fs);
   1030 		}
   1031 
   1032 		/*
   1033 		 * Phase IV: Roll forward, updating data blocks.
   1034 		 */
   1035 		offset = startoffset;
   1036 		nextserial = startserial + 1;
   1037 		DLOG((DLOG_RF, "LFS roll forward phase 4 beginning\n"));
   1038 		while (offset > 0 && offset != endpseg) {
   1039 			lfs_parse_pseg(fs, &offset, nextserial++, cred,
   1040 				     NULL, l, NULL, finfo_func_rfw,
   1041 				     CKSEG_AVAIL, NULL);
   1042 			DEBUG_CHECK_FREELIST(fs);
   1043 		}
   1044 
   1045 		/*
   1046 		 * Finish: flush our changes to disk.
   1047 		 */
   1048 		lfs_sb_setserial(fs, endserial);
   1049 
   1050 		lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
   1051 		DLOG((DLOG_RF, "lfs_mountfs: roll forward "
   1052 		      "examined %jd blocks\n",
   1053 		      (intmax_t)(endpseg - startoffset)));
   1054 	}
   1055 
   1056 	/* Get rid of our vnodes, except the ifile */
   1057 	drop_vnode_pages(mp, l);
   1058 	DLOG((DLOG_RF, "LFS roll forward complete\n"));
   1059 	printf("%s: roll forward recovered %d data blocks\n",
   1060 		lfs_sb_getfsmnt(fs), rblkcnt);
   1061 
   1062 	/*
   1063 	 * At this point we have no more changes to write to disk.
   1064 	 * Reset the "avail" count to match the segments as they
   1065 	 * appear on disk, and the clean segment count.
   1066 	 */
   1067 	lfs_reset_avail(fs);
   1068 }
   1069 
/*
 * Vnode selector that accepts every vnode; both arguments are ignored.
 */
static bool
all_selector(void *cl, struct vnode *vp)
{
	(void)cl;
	(void)vp;

	return true;
}
   1075 
   1076 /*
   1077  * Dump any pages from vnodes that may have been put on
   1078  * during truncation.
   1079  */
   1080 static void
   1081 drop_vnode_pages(struct mount *mp, struct lwp *l)
   1082 {
   1083        struct vnode_iterator *marker;
   1084        struct lfs *fs;
   1085        struct vnode *vp;
   1086 
   1087        fs = VFSTOULFS(mp)->um_lfs;
   1088        vfs_vnode_iterator_init(mp, &marker);
   1089        while ((vp = vfs_vnode_iterator_next(marker,
   1090                all_selector, NULL)) != NULL) {
   1091                if (vp == fs->lfs_ivnode)
   1092                        continue;
   1093                VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
   1094                uvm_vnp_setsize(vp, 0);
   1095                uvm_vnp_setsize(vp, VTOI(vp)->i_size);
   1096                VOP_UNLOCK(vp);
   1097                vrele(vp);
   1098        }
   1099        vfs_vnode_iterator_destroy(marker);
   1100 }
   1101 
   1102 static int
   1103 ino_func_setclean(struct lfs_inofuncarg *lifa)
   1104 {
   1105 	struct lfs *fs;
   1106 	daddr_t offset;
   1107 	struct vnode *devvp, *vp;
   1108 	union lfs_dinode *dip;
   1109 	struct buf *dbp, *ibp;
   1110 	int error;
   1111 	IFILE *ifp;
   1112 	unsigned i, num;
   1113 	daddr_t true_addr;
   1114 	ino_t ino;
   1115 
   1116 	fs = lifa->fs;
   1117 	offset = lifa->offset;
   1118 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
   1119 
   1120 	/* Read inode block */
   1121 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
   1122 	    0, &dbp);
   1123 	if (error) {
   1124 		DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n",
   1125 		      error));
   1126 		return error;
   1127 	}
   1128 	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
   1129 	brelse(dbp, BC_AGE);
   1130 
   1131 	/* Check each inode against ifile entry */
   1132 	num = LFS_INOPB(fs);
   1133 	for (i = num; i-- > 0; ) {
   1134 		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
   1135 		ino = lfs_dino_getinumber(fs, dip);
   1136 		if (ino == LFS_IFILE_INUM) {
   1137 			/* Check address against superblock */
   1138 			true_addr = lfs_sb_getidaddr(fs);
   1139 		} else {
   1140 			/* Not ifile.  Check address against ifile. */
   1141 			LFS_IENTRY(ifp, fs, ino, ibp);
   1142 			true_addr = lfs_if_getdaddr(fs, ifp);
   1143 			brelse(ibp, 0);
   1144 		}
   1145 		if (offset != true_addr)
   1146 			continue;
   1147 
   1148 		/* XXX We can use fastvget here! */
   1149 
   1150 		/*
   1151 		 * An inode we need to relocate.
   1152 		 * Get it if we can.
   1153 		 */
   1154 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
   1155 				 LK_EXCLUSIVE | LK_NOWAIT, &vp);
   1156 		if (error)
   1157 			continue;
   1158 
   1159 		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
   1160 		lfs_setclean(fs, vp);
   1161 		VOP_UNLOCK(vp);
   1162 		vrele(vp);
   1163 
   1164 	}
   1165 
   1166 	return error;
   1167 }
   1168 
   1169 static int
   1170 ino_func_rewrite(struct lfs_inofuncarg *lifa)
   1171 {
   1172 	struct lfs *fs;
   1173 	daddr_t offset;
   1174 	struct vnode *devvp, *vp;
   1175 	union lfs_dinode *dip;
   1176 	struct buf *dbp, *ibp;
   1177 	int error;
   1178 	IFILE *ifp;
   1179 	unsigned i, num;
   1180 	daddr_t true_addr;
   1181 	ino_t ino;
   1182 
   1183 	fs = lifa->fs;
   1184 	offset = lifa->offset;
   1185 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
   1186 
   1187 	/* Read inode block */
   1188 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
   1189 	    0, &dbp);
   1190 	if (error) {
   1191 		DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n",
   1192 		      error));
   1193 		return error;
   1194 	}
   1195 	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
   1196 	brelse(dbp, BC_AGE);
   1197 
   1198 	/* Check each inode against ifile entry */
   1199 	num = LFS_INOPB(fs);
   1200 	for (i = num; i-- > 0; ) {
   1201 		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
   1202 		ino = lfs_dino_getinumber(fs, dip);
   1203 		if (ino == LFS_IFILE_INUM) {
   1204 			/* Check address against superblock */
   1205 			true_addr = lfs_sb_getidaddr(fs);
   1206 		} else {
   1207 			/* Not ifile.  Check address against ifile. */
   1208 			LFS_IENTRY(ifp, fs, ino, ibp);
   1209 			true_addr = lfs_if_getdaddr(fs, ifp);
   1210 			brelse(ibp, 0);
   1211 		}
   1212 		if (offset != true_addr)
   1213 			continue;
   1214 
   1215 		if (ino == LFS_IFILE_INUM)
   1216 			continue;
   1217 
   1218 		/* XXX We can use fastvget here! */
   1219 
   1220 		/*
   1221 		 * An inode we need to relocate.
   1222 		 * Get it if we can.
   1223 		 */
   1224 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
   1225 				 LK_EXCLUSIVE | LK_NOWAIT, &vp);
   1226 		if (error)
   1227 			continue;
   1228 
   1229 		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
   1230 
   1231 		if (!(VTOI(vp)->i_state & IN_CLEANING)) {
   1232 			lfs_setclean(fs, vp);
   1233 			lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
   1234 		}
   1235 
   1236 		VOP_UNLOCK(vp);
   1237 		vrele(vp);
   1238 
   1239 	}
   1240 
   1241 	return error;
   1242 }
   1243 
   1244 static int
   1245 rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop)
   1246 {
   1247 	daddr_t daddr;
   1248 	int error;
   1249 	struct buf *bp;
   1250 	struct inode *ip;
   1251 
   1252 	KASSERT(have_finfop != NULL);
   1253 
   1254 	/* Look up current location of this block. */
   1255 	error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
   1256 	if (error)
   1257 		return error;
   1258 
   1259 	/* Skip any block that is not here. */
   1260 	if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset)
   1261 		return ESTALE;
   1262 
   1263 	/*
   1264 	 * It is (was recently) here.  Read the block.
   1265 	 */
   1266 	//size = lfs_blksize(fs, VTOI(vp), lbn);
   1267 	error = bread(vp, lbn, size, 0, &bp);
   1268 	if (error)
   1269 		return error;
   1270 
   1271 	if (vp == fs->lfs_ivnode) {
   1272 		VOP_BWRITE(vp, bp);
   1273 	} else {
   1274 		/* Get ready to write. */
   1275 		if (!*have_finfop) {
   1276 			ip = VTOI(vp);
   1277 			lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
   1278 			fs->lfs_sp->vp = vp;
   1279 			*have_finfop = 1;
   1280 		}
   1281 
   1282 		KASSERT(bp->b_vp == vp);
   1283 		/* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */
   1284 		lfs_bwrite_ext(bp, BW_CLEAN);
   1285 		KASSERT(bp->b_vp == vp);
   1286 		mutex_enter(&bufcache_lock);
   1287 		while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) {
   1288 			KASSERT(bp->b_vp != NULL);
   1289 		}
   1290 		mutex_exit(&bufcache_lock);
   1291 
   1292 		KASSERT(bp->b_flags & B_GATHERED);
   1293 		KASSERT(fs->lfs_sp->cbpp[-1] == bp);
   1294 	}
   1295 	return 0;
   1296 }
   1297 
   1298 static int
   1299 finfo_func_rewrite(struct lfs_finfofuncarg *lffa)
   1300 {
   1301 	struct lfs *fs;
   1302 	FINFO *fip;
   1303 	daddr_t *offsetp;
   1304 	int j, have_finfo, error;
   1305 	size_t size, bytes;
   1306 	ino_t ino;
   1307 	uint32_t gen;
   1308 	struct vnode *vp;
   1309 	daddr_t lbn;
   1310 	int *fragsp;
   1311 
   1312 	fs = lffa->fs;
   1313 	fip = lffa->finfop;
   1314 	offsetp = lffa->offsetp;
   1315 	fragsp = (int *)lffa->arg;
   1316 
   1317 	/* Get the inode and check its version. */
   1318 	ino = lfs_fi_getino(fs, fip);
   1319 	gen = lfs_fi_getversion(fs, fip);
   1320 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE|LK_NOWAIT,
   1321 			 &vp);
   1322 
   1323 	/*
   1324 	 * If we can't, or if version is wrong, or it has dirop blocks on it,
   1325 	 * we can't relocate its blocks; but we still have to count
   1326 	 * blocks through the partial segment to return the right offset.
   1327 	 * XXX actually we can move DIROP vnodes' *old* data, as long
   1328 	 * XXX as we are sure that we are moving *only* the old data---?
   1329 	 */
   1330 	if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) {
   1331 		if (error == 0)
   1332 			error = ESTALE;
   1333 
   1334 		if (vp != NULL) {
   1335 			VOP_UNLOCK(vp);
   1336 			vrele(vp);
   1337 			vp = NULL;
   1338 		}
   1339 		bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
   1340 			+ lfs_fi_getlastlength(fs, fip);
   1341 		*offsetp += lfs_btofsb(fs, bytes);
   1342 
   1343 		return error;
   1344 	}
   1345 
   1346 	/*
   1347 	 * We have the vnode and its version is correct.
   1348 	 * Take a cleaning reference; and loop through the blocks
   1349 	 * and rewrite them.
   1350 	 */
   1351 	lfs_setclean(fs, vp);
   1352 	size = lfs_sb_getbsize(fs);
   1353 	have_finfo = 0;
   1354 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
   1355 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
   1356 			size = lfs_fi_getlastlength(fs, fip);
   1357 		/*
   1358 		 * An error of ESTALE indicates that there was nothing
   1359 		 * to rewrite; this is not a problem.  Any other error
   1360 		 * causes us to skip the rest of this FINFO.
   1361 		 */
   1362 		if (vp != NULL && error == 0) {
   1363 			lbn = lfs_fi_getblock(fs, fip, j);
   1364 			error = rewrite_block(fs, vp, lbn, *offsetp,
   1365 					      size, &have_finfo);
   1366 			if (error == ESTALE)
   1367 				error = 0;
   1368 			if (fragsp != NULL && error == 0)
   1369 				*fragsp += lfs_btofsb(fs, size);
   1370 		}
   1371 		*offsetp += lfs_btofsb(fs, size);
   1372 	}
   1373 
   1374 	/*
   1375 	 * If we acquired finfo, release it and write the blocks.
   1376 	 */
   1377 	if (have_finfo) {
   1378 		lfs_updatemeta(fs->lfs_sp);
   1379 		fs->lfs_sp->vp = NULL;
   1380 		lfs_release_finfo(fs);
   1381 		lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
   1382 	}
   1383 
   1384 	/* Release vnode */
   1385 	VOP_UNLOCK(vp);
   1386 	vrele(vp);
   1387 
   1388 	return error;
   1389 }
   1390 
   1391 static int
   1392 finfo_func_setclean(struct lfs_finfofuncarg *lffa)
   1393 {
   1394 	struct lfs *fs;
   1395 	FINFO *fip;
   1396 	daddr_t *offsetp;
   1397 	int error;
   1398 	size_t bytes;
   1399 	ino_t ino;
   1400 	uint32_t gen;
   1401 	struct vnode *vp;
   1402 
   1403 	fs = lffa->fs;
   1404 	fip = lffa->finfop;
   1405 	offsetp = lffa->offsetp;
   1406 
   1407 	/* Get the inode and check its version. */
   1408 	ino = lfs_fi_getino(fs, fip);
   1409 	gen = lfs_fi_getversion(fs, fip);
   1410 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE|LK_NOWAIT,
   1411 			 &vp);
   1412 
   1413 	/* If we have it and its version is right, take a cleaning reference */
   1414 	if (error == 0 && VTOI(vp)->i_gen == gen)
   1415 		lfs_setclean(fs, vp);
   1416 
   1417 	if (vp != NULL) {
   1418 		VOP_UNLOCK(vp);
   1419 		vrele(vp);
   1420 		vp = NULL;
   1421 	}
   1422 
   1423 	/* Skip to the next block */
   1424 	bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
   1425 		+ lfs_fi_getlastlength(fs, fip);
   1426 	*offsetp += lfs_btofsb(fs, bytes);
   1427 
   1428 	return error;
   1429 }
   1430 
   1431 /*
   1432  * Use the partial-segment parser to rewrite (clean) a segment.
   1433  */
   1434 int
   1435 lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l)
   1436 {
   1437 	daddr_t ooffset, offset, endpseg;
   1438 
   1439 	ASSERT_SEGLOCK(fs);
   1440 
   1441 	offset = lfs_sntod(fs, sn);
   1442 	skip_superblock(fs, &offset);
   1443 	endpseg = lfs_sntod(fs, sn + 1);
   1444 
   1445 	while (offset > 0 && offset != endpseg) {
   1446 		/* First check summary validity (XXX unnecessary?) */
   1447 		ooffset = offset;
   1448 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
   1449 			     NULL, NULL, CKSEG_CKSUM, NULL);
   1450 		if (offset == ooffset)
   1451 			break;
   1452 
   1453 		/*
   1454 		 * Valid, proceed.
   1455 		 *
   1456 		 * First write the file blocks, marking their
   1457 		 * inodes IN_CLEANING.
   1458 		 */
   1459 		offset = ooffset;
   1460 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
   1461 			       NULL, finfo_func_rewrite,
   1462 			       CKSEG_NONE, fragsp);
   1463 
   1464 		/*
   1465 		 * Now go back and pick up any inodes that
   1466 		 * were not already marked IN_CLEANING, and
   1467 		 * write them as well.
   1468 		 */
   1469 		offset = ooffset;
   1470 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
   1471 			       ino_func_rewrite, NULL,
   1472 			       CKSEG_NONE, fragsp);
   1473 	}
   1474 	return 0;
   1475 }
   1476 
   1477 /*
   1478  * Rewrite the contents of one or more segments, in preparation for
   1479  * marking them clean.
   1480  */
   1481 int
   1482 lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l)
   1483 {
   1484 	kauth_cred_t cred;
   1485 	int i, error;
   1486 	struct buf *bp;
   1487 	SEGUSE *sup;
   1488 	daddr_t offset, endpseg;
   1489 
   1490 	ASSERT_NO_SEGLOCK(fs);
   1491 
   1492 	cred = l ? l->l_cred : NOCRED;
   1493 
   1494 	/* Prevent new dirops and acquire the cleaner lock. */
   1495 	lfs_writer_enter(fs, "rewritesegs");
   1496 	if ((error = lfs_cleanerlock(fs)) != 0) {
   1497 		lfs_writer_leave(fs);
   1498 		return error;
   1499 	}
   1500 
   1501 	/*
   1502 	 * Pre-reference vnodes now that we have cleaner lock
   1503 	 * but before we take the segment lock.  We don't want to
   1504 	 * mix cleaning blocks with flushed vnodes.
   1505 	 */
   1506 	for (i = 0; i < len; i++) {
   1507 		error = 0;
   1508 		/* Refuse to clean segments that are ACTIVE */
   1509 		LFS_SEGENTRY(sup, fs, snn[i], bp);
   1510 		if (sup->su_flags & SEGUSE_ACTIVE
   1511 		    || !(sup->su_flags & SEGUSE_DIRTY))
   1512 			error = EINVAL;
   1513 
   1514 		brelse(bp, 0);
   1515 		if (error)
   1516 			break;
   1517 
   1518 		offset = lfs_sntod(fs, snn[i]);
   1519 		skip_superblock(fs, &offset);
   1520 		endpseg = lfs_sntod(fs, snn[i] + 1);
   1521 
   1522 		while (offset > 0 && offset != endpseg) {
   1523 			lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
   1524 				       ino_func_setclean, finfo_func_setclean,
   1525 				       CKSEG_NONE, NULL);
   1526 		}
   1527 	}
   1528 
   1529 	/*
   1530 	 * Actually rewrite the contents of the segment.
   1531 	 */
   1532 	lfs_seglock(fs, SEGM_CLEAN);
   1533 
   1534 	for (i = 0; i < len; i++) {
   1535 		error = 0;
   1536 		/* Refuse to clean segments that are ACTIVE */
   1537 		LFS_SEGENTRY(sup, fs, snn[i], bp);
   1538 		if (sup->su_flags & SEGUSE_ACTIVE
   1539 		    || !(sup->su_flags & SEGUSE_DIRTY))
   1540 			error = EINVAL;
   1541 
   1542 		brelse(bp, 0);
   1543 		if (error)
   1544 			break;
   1545 
   1546 		error = lfs_rewrite_segment(fs, snn[i], directp, cred, l);
   1547 		if (error) {
   1548 			printf("  rewrite_segment returned %d\n", error);
   1549 			break;
   1550 		}
   1551 	}
   1552 	while (lfs_writeseg(fs, fs->lfs_sp))
   1553 		;
   1554 
   1555 	*offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written);
   1556 	lfs_segunlock(fs);
   1557 	lfs_cleanerunlock(fs);
   1558 	lfs_writer_leave(fs);
   1559 
   1560 	return error;
   1561 }
   1562 
#if 0
/*
 * Everything up to the matching #endif is compiled out.
 *
 * NOTE(review): if this code is ever re-enabled, check at least:
 *  - rewrite_block() asserts have_finfop != NULL, but NULL is passed
 *    below;
 *  - the vnode obtained from VFS_VGET() is never unlocked or released
 *    in this function;
 *  - lbn is incremented both by the inner loop (lbn++) and by the
 *    outer loop (++lbn);
 *  - the result of ulfs_bmaparray() is not examined before daddr is
 *    used;
 *  - lfs_segunlock() is called although this function only asserts
 *    that the segment lock is held and never takes it itself.
 */

/*
 * Sequence predicate handed to ulfs_bmaparray(): true when lbn2 equals
 * lbn1 plus the superblock "frag" value.
 */
static bool
lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2)
{
	return lbn2 == lbn1 + lfs_sb_getfrag(__UNCONST(fs));
}

/*
 * Rewrite the contents of a file in order to coalesce it.
 * We don't bother rewriting indirect blocks because they will have to
 * be rewritten anyway when we rewrite the direct blocks.
 *
 * fs  - filesystem; the segment lock must be held.
 * ino - inode number of the file to rewrite.
 * l   - unused.
 *
 * Returns the VFS_VGET() error, or the last error recorded while
 * walking the file's blocks.
 */
int
lfs_rewrite_file(struct lfs *fs, ino_t ino, struct lwp *l)
{
	daddr_t lbn, hiblk, daddr;
	int i, error, num, run;
	struct vnode *vp;
	struct indir indirs[ULFS_NIADDR+2];
	size_t size;

	ASSERT_SEGLOCK(fs);

	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
	if (error)
		return error;

	lfs_acquire_finfo(fs, ino, VTOI(vp)->i_gen);
	/* Walk logical blocks below i_lfs_hiblk, one run at a time. */
	for (lbn = 0, hiblk = VTOI(vp)->i_lfs_hiblk; lbn < hiblk; ++lbn) {
		error = ulfs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, &run,
				       lfs_isseq);
		/* Blocks with no assigned address are skipped. */
		if (daddr == UNASSIGNED)
			continue;
		/* Offset 0x0 disables rewrite_block()'s location check. */
		for (i = 0; i <= run; i++) {
			size = lfs_blksize(fs, VTOI(vp), lbn);
			error = rewrite_block(fs, vp, lbn++, 0x0, size, NULL);
			if (error)
				break;
		}
	}
	lfs_release_finfo(fs);
	/* Push out whatever has been gathered. */
	while (lfs_writeseg(fs, fs->lfs_sp))
		;
	lfs_segunlock(fs);

	return error;
}
#endif /* 0 */
   1611 
   1612 
   1613 static int
   1614 ino_func_checkempty(struct lfs_inofuncarg *lifa)
   1615 {
   1616 	struct lfs *fs;
   1617 	daddr_t offset;
   1618 	struct vnode *devvp;
   1619 	union lfs_dinode *dip;
   1620 	struct buf *dbp, *ibp;
   1621 	int error;
   1622 	IFILE *ifp;
   1623 	unsigned i, num;
   1624 	daddr_t true_addr;
   1625 	ino_t ino;
   1626 
   1627 	fs = lifa->fs;
   1628 	offset = lifa->offset;
   1629 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
   1630 
   1631 	/* Read inode block */
   1632 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
   1633 	    0, &dbp);
   1634 	if (error) {
   1635 		DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n",
   1636 		      error));
   1637 		return error;
   1638 	}
   1639 
   1640 	/* Check each inode against ifile entry */
   1641 	num = LFS_INOPB(fs);
   1642 	for (i = num; i-- > 0; ) {
   1643 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
   1644 		ino = lfs_dino_getinumber(fs, dip);
   1645 		if (ino == LFS_IFILE_INUM) {
   1646 			/* Check address against superblock */
   1647 			true_addr = lfs_sb_getidaddr(fs);
   1648 		} else {
   1649 			/* Not ifile.  Check address against ifile. */
   1650 			LFS_IENTRY(ifp, fs, ino, ibp);
   1651 			true_addr = lfs_if_getdaddr(fs, ifp);
   1652 			brelse(ibp, 0);
   1653 		}
   1654 		if (offset == true_addr) {
   1655 			error = EEXIST;
   1656 			break;
   1657 		}
   1658 	}
   1659 	brelse(dbp, BC_AGE);
   1660 
   1661 	return error;
   1662 }
   1663 
   1664 static int
   1665 finfo_func_checkempty(struct lfs_finfofuncarg *lffa)
   1666 {
   1667 	struct lfs *fs;
   1668 	FINFO *fip;
   1669 	daddr_t *offsetp;
   1670 	int j, error;
   1671 	size_t size, bytes;
   1672 	ino_t ino;
   1673 	uint32_t gen;
   1674 	struct vnode *vp;
   1675 	daddr_t lbn, daddr;
   1676 
   1677 	fs = lffa->fs;
   1678 	fip = lffa->finfop;
   1679 	offsetp = lffa->offsetp;
   1680 
   1681 	/* Get the inode and check its version. */
   1682 	ino = lfs_fi_getino(fs, fip);
   1683 	gen = lfs_fi_getversion(fs, fip);
   1684 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
   1685 
   1686 	/*
   1687 	 * If we can't, or if version is wrong, this FINFO does not refer
   1688 	 * to a live file.  Skip over it and continue.
   1689 	 */
   1690 	if (error || VTOI(vp)->i_gen != gen) {
   1691 		if (error == 0)
   1692 			error = ESTALE;
   1693 
   1694 		if (vp != NULL) {
   1695 			VOP_UNLOCK(vp);
   1696 			vrele(vp);
   1697 			vp = NULL;
   1698 		}
   1699 		bytes = ((lfs_fi_getnblocks(fs, fip) - 1)
   1700 			 << lfs_sb_getbshift(fs))
   1701 			+ lfs_fi_getlastlength(fs, fip);
   1702 		*offsetp += lfs_btofsb(fs, bytes);
   1703 
   1704 		return error;
   1705 	}
   1706 
   1707 	/*
   1708 	 * We have the vnode and its version is correct.
   1709 	 * Loop through the blocks and check their currency.
   1710 	 */
   1711 	size = lfs_sb_getbsize(fs);
   1712 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
   1713 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
   1714 			size = lfs_fi_getlastlength(fs, fip);
   1715 		if (vp != NULL) {
   1716 			lbn = lfs_fi_getblock(fs, fip, j);
   1717 
   1718 			/* Look up current location of this block. */
   1719 			error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
   1720 			if (error)
   1721 				break;
   1722 
   1723 			/* If it is here, the segment is not empty. */
   1724 			if (LFS_DBTOFSB(fs, daddr) == *offsetp) {
   1725 				error = EEXIST;
   1726 				break;
   1727 			}
   1728 		}
   1729 		*offsetp += lfs_btofsb(fs, size);
   1730 	}
   1731 
   1732 	/* Release vnode */
   1733 	VOP_UNLOCK(vp);
   1734 	vrele(vp);
   1735 
   1736 	return error;
   1737 }
   1738 
   1739 int
   1740 lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l)
   1741 {
   1742 	daddr_t offset, endpseg;
   1743 	int error;
   1744 
   1745 	ASSERT_SEGLOCK(fs);
   1746 
   1747 	offset = lfs_sntod(fs, sn);
   1748 	skip_superblock(fs, &offset);
   1749 	endpseg = lfs_sntod(fs, sn + 1);
   1750 
   1751 	while (offset > 0 && offset < endpseg) {
   1752 		error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
   1753 				     ino_func_checkempty,
   1754 				     finfo_func_checkempty,
   1755 				     CKSEG_NONE, NULL);
   1756 		if (error)
   1757 			return error;
   1758 	}
   1759 	return 0;
   1760 }
   1761