Home | History | Annotate | Line # | Download | only in lfs
      1 /*	$NetBSD: lfs_rfw.c,v 1.41 2025/11/06 15:45:32 perseant Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2025 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.41 2025/11/06 15:45:32 perseant Exp $");
     34 
     35 #if defined(_KERNEL_OPT)
     36 #include "opt_quota.h"
     37 #endif
     38 
     39 #include <sys/param.h>
     40 #include <sys/systm.h>
     41 #include <sys/namei.h>
     42 #include <sys/proc.h>
     43 #include <sys/kernel.h>
     44 #include <sys/vnode.h>
     45 #include <sys/mount.h>
     46 #include <sys/kthread.h>
     47 #include <sys/buf.h>
     48 #include <sys/device.h>
     49 #include <sys/file.h>
     50 #include <sys/disklabel.h>
     51 #include <sys/ioctl.h>
     52 #include <sys/errno.h>
     53 #include <sys/malloc.h>
     54 #include <sys/pool.h>
     55 #include <sys/socket.h>
     56 #include <sys/stat.h>
     57 #include <sys/syslog.h>
     58 #include <sys/sysctl.h>
     59 #include <sys/conf.h>
     60 #include <sys/kauth.h>
     61 
     62 #include <miscfs/specfs/specdev.h>
     63 
     64 #include <ufs/lfs/ulfs_quotacommon.h>
     65 #include <ufs/lfs/ulfs_inode.h>
     66 #include <ufs/lfs/ulfsmount.h>
     67 #include <ufs/lfs/ulfs_extern.h>
     68 
     69 #include <uvm/uvm_extern.h>
     70 
     71 #include <ufs/lfs/lfs.h>
     72 #include <ufs/lfs/lfs_accessors.h>
     73 #include <ufs/lfs/lfs_kernel.h>
     74 #include <ufs/lfs/lfs_extern.h>
     75 
     76 #include <miscfs/genfs/genfs.h>
     77 #include <miscfs/genfs/genfs_node.h>
     78 
     79 /*
     80  * Roll-forward code.
     81  */
     82 static bool all_selector(void *, struct vnode *);
     83 static void drop_vnode_pages(struct mount *, struct lwp *);
     84 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *,
     85 				      const union lfs_dinode *);
     86 static int update_inogen(struct lfs_inofuncarg *);
     87 static int update_inoblk(struct lfs_inofuncarg *);
     88 static int finfo_func_rfw(struct lfs_finfofuncarg *);
     89 
     90 static int update_meta(struct lfs *, ino_t, int, daddr_t, daddr_t, size_t,
     91 		       struct lwp *l);
     92 #if 0
     93 static bool lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2);
     94 #endif
     95 
     96 extern int lfs_do_rfw;
     97 int rblkcnt;
     98 int lfs_rfw_max_psegs = 0;
     99 
    100 /*
    101  * Allocate a particular inode with a particular version number, freeing
    102  * any previous versions of this inode that may have gone before.
    103  * Used by the roll-forward code.
    104  *
    105  * XXX this function does not have appropriate locking to be used on a live fs;
    106  * XXX but something similar could probably be used for an "undelete" call.
    107  *
    108  * Called with the Ifile inode locked.
    109  */
    110 int
    111 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
    112 	      struct vnode **vpp, union lfs_dinode *dip)
    113 {
    114 	struct vattr va;
    115 	struct vnode *vp;
    116 	struct inode *ip;
    117 	int error;
    118 
    119 	KASSERT(ino > LFS_IFILE_INUM);
    120 	LFS_ASSERT_MAXINO(fs, ino);
    121 
    122 	ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
    123 
    124 	/*
    125 	 * First, just try a vget. If the version number is the one we want,
    126 	 * we don't have to do anything else.  If the version number is wrong,
    127 	 * take appropriate action.
    128 	 */
    129 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
    130 	if (error == 0) {
    131 		DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n",
    132 			(int)ino, vp));
    133 
    134 		*vpp = vp;
    135 		ip = VTOI(vp);
    136 		DLOG((DLOG_RF, "  ip->i_gen=%jd dip nlink %jd seeking"
    137 			" version %jd\n", (intmax_t)ip->i_gen,
    138 			(intmax_t)(dip == NULL ? -1
    139 				: lfs_dino_getnlink(fs, dip)), (intmax_t)vers));
    140 		if (ip->i_gen == vers) {
    141 			/*
    142 			 * We have what we wanted already.
    143 			 */
    144 			DLOG((DLOG_RF, "  pre-existing\n"));
    145 			return 0;
    146 		} else if (ip->i_gen < vers && dip != NULL
    147 			&& lfs_dino_getnlink(fs, dip) > 0) {
    148 			/*
    149 			 * We have found a newer version.  Truncate
    150 			 * the old vnode to zero and re-initialize
    151 			 * from the given dinode.
    152 			 */
    153 			DLOG((DLOG_RF, "  replace old version %jd\n",
    154 				(intmax_t)ip->i_gen));
    155 			lfs_truncate(vp, (off_t)0, 0, NOCRED);
    156 			ip->i_gen = vers;
    157 			vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip));
    158 			update_inoblk_copy_dinode(fs, ip->i_din, dip);
    159 			LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
    160 			return 0;
    161 		} else {
    162 			/*
    163 			 * Not the right version and nothing to
    164 			 * initialize from.  Don't recover this data.
    165 			 */
    166 			DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
    167 				(int)ino, (int)vers,
    168 				(int)lfs_dino_getgen(fs, ip->i_din)));
    169 			vput(vp);
    170 			*vpp = NULLVP;
    171 			return EEXIST;
    172 		}
    173 	}
    174 
    175 	/*
    176 	 * No version of this inode was found in the cache.
    177 	 * Make a new one from the dinode.  We will add data blocks
    178 	 * as they come in, so scrub any block addresses off of the
    179 	 * inode and reset block counts to zero.
    180 	 */
    181 	if (dip == NULL)
    182 		return ENOENT;
    183 
    184 	vattr_null(&va);
    185 	va.va_type = IFTOVT(lfs_dino_getmode(fs, dip));
    186 	va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS;
    187 	va.va_fileid = ino;
    188 	va.va_gen = vers;
    189 	error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL,
    190 	    &vp);
    191 	if (error)
    192 		return error;
    193 	error = vn_lock(vp, LK_EXCLUSIVE);
    194 	if (error)
    195 		goto err;
    196 
    197 	ip = VTOI(vp);
    198 	update_inoblk_copy_dinode(fs, ip->i_din, dip);
    199 
    200 	DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d,"
    201 		" blocks=%d\n", (int)ino, vp, (long long)ip->i_size,
    202 		(int)ip->i_lfs_effnblks,
    203 		(int)lfs_dino_getblocks(fs, ip->i_din)));
    204 	*vpp = vp;
    205 	return 0;
    206 
    207 err:
    208 	vrele(vp);
    209 	*vpp = NULLVP;
    210 	return error;
    211 }
    212 
    213 /*
    214  * Load the appropriate indirect block, and change the appropriate pointer.
    215  * Mark the block dirty.  Do segment and avail accounting.
    216  */
    217 static int
    218 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
    219 	    daddr_t ndaddr, size_t size, struct lwp *l)
    220 {
    221 	int error;
    222 	struct vnode *vp;
    223 	struct inode *ip;
    224 	daddr_t odaddr;
    225 	struct indir a[ULFS_NIADDR];
    226 	int num;
    227 	struct buf *bp;
    228 	SEGUSE *sup;
    229 	u_int64_t newsize, loff;
    230 
    231 	KASSERT(lbn >= 0);	/* no indirect blocks */
    232 	KASSERT(ino > LFS_IFILE_INUM);
    233 	LFS_ASSERT_MAXINO(fs, ino);
    234 
    235 	DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n",
    236 	      (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr));
    237 
    238 	if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0)
    239 		return error;
    240 	ip = VTOI(vp);
    241 
    242 	/*
    243 	 * If block already exists, note its new location
    244 	 * but do not account it as new.
    245 	 */
    246 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
    247 	if (odaddr == UNASSIGNED) {
    248 		if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)),
    249 					size, NOCRED, 0, &bp)) != 0) {
    250 			vput(vp);
    251 			return (error);
    252 		}
    253 		/* No need to write, the block is already on disk */
    254 		if (bp->b_oflags & BO_DELWRI) {
    255 			LFS_UNLOCK_BUF(bp);
    256 			/* Account recovery of the previous version */
    257 			lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
    258 		}
    259 		brelse(bp, BC_INVAL);
    260 		DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d,"
    261 			" lfs_dino_getblocks(fs, ip->i_din) = %d\n",
    262 			(int)ip->i_lfs_effnblks,
    263 			(int)lfs_dino_getblocks(fs, ip->i_din)));
    264 	} else {
    265 		/* XXX fragextend? */
    266 		DLOG((DLOG_RF, "block exists, no balloc\n"));
    267 	}
    268 
    269 	/*
    270 	 * Extend the file, if it is not large enough already.
    271 	 * XXX This is not exactly right, we don't know how much of the
    272 	 * XXX last block is actually used.
    273 	 *
    274 	 * XXX We should be able to encode the actual data length of the
    275 	 * XXX last block in fi_lastlength, since we can infer the
    276 	 * XXX necessary block length from that using a variant of
    277 	 * XXX lfs_blksize().
    278 	 */
    279 	loff = lfs_lblktosize(fs, lbn);
    280 	if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) {
    281 		/* No fragments */
    282 		newsize = loff + 1;
    283 	} else {
    284 		/* Subtract only a fragment to account for block size */
    285 		newsize = loff + size - lfs_fsbtob(fs, 1) + 1;
    286 	}
    287 
    288 	if (ip->i_size < newsize) {
    289 		DLOG((DLOG_RF, "ino %d size %d -> %d\n",
    290 		      (int)ino, (int)ip->i_size, (int)newsize));
    291 		lfs_dino_setsize(fs, ip->i_din, newsize);
    292 		ip->i_size = newsize;
    293 		/*
    294 		 * tell vm our new size for the case the inode won't
    295 		 * appear later.
    296 		 */
    297 		uvm_vnp_setsize(vp, newsize);
    298 	}
    299 
    300 	lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
    301 
    302 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
    303 	sup->su_nbytes += size;
    304 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
    305 
    306 	/* differences here should be due to UNWRITTEN indirect blocks. */
    307 	if (vp->v_type != VLNK) {
    308 		if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din))
    309 #if 0
    310 		    || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
    311 			 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din))
    312 #endif /* 0 */
    313 			) {
    314 			vprint("vnode", vp);
    315 			printf("effnblks=%jd dino_getblocks=%jd\n",
    316 			       (intmax_t)ip->i_lfs_effnblks,
    317 			       (intmax_t)lfs_dino_getblocks(fs, ip->i_din));
    318 		}
    319 		KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
    320 #if 0
    321 		KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
    322 			ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din));
    323 #endif /* 0 */
    324 	}
    325 
    326 #ifdef DEBUG
    327 	/* Now look again to make sure it worked */
    328 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
    329 	if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
    330 		DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd"
    331 		      " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr));
    332 #endif /* DEBUG */
    333 	vput(vp);
    334 	return 0;
    335 }
    336 
    337 /*
    338  * Copy some the fields of the dinode as needed by update_inoblk().
    339  */
    340 static void
    341 update_inoblk_copy_dinode(struct lfs *fs,
    342     union lfs_dinode *dstu, const union lfs_dinode *srcu)
    343 {
    344 	if (fs->lfs_is64) {
    345 		struct lfs64_dinode *dst = &dstu->u_64;
    346 		const struct lfs64_dinode *src = &srcu->u_64;
    347 		unsigned i;
    348 
    349 		/*
    350 		 * Copy everything but the block pointers and di_blocks.
    351 		 * XXX what about di_extb?
    352 		 */
    353 		dst->di_mode = src->di_mode;
    354 		dst->di_nlink = src->di_nlink;
    355 		dst->di_uid = src->di_uid;
    356 		dst->di_gid = src->di_gid;
    357 		dst->di_blksize = src->di_blksize;
    358 		dst->di_size = src->di_size;
    359 		dst->di_atime = src->di_atime;
    360 		dst->di_mtime = src->di_mtime;
    361 		dst->di_ctime = src->di_ctime;
    362 		dst->di_birthtime = src->di_birthtime;
    363 		dst->di_mtimensec = src->di_mtimensec;
    364 		dst->di_atimensec = src->di_atimensec;
    365 		dst->di_ctimensec = src->di_ctimensec;
    366 		dst->di_birthnsec = src->di_birthnsec;
    367 		dst->di_gen = src->di_gen;
    368 		dst->di_kernflags = src->di_kernflags;
    369 		dst->di_flags = src->di_flags;
    370 		dst->di_extsize = src->di_extsize;
    371 		dst->di_modrev = src->di_modrev;
    372 		dst->di_inumber = src->di_inumber;
    373 		for (i = 0; i < __arraycount(src->di_spare); i++) {
    374 			dst->di_spare[i] = src->di_spare[i];
    375 		}
    376 		/* Short symlinks store their data in di_db. */
    377 		if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
    378 		    && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
    379 			memcpy(dst->di_db, src->di_db, src->di_size);
    380 		}
    381 	} else {
    382 		struct lfs32_dinode *dst = &dstu->u_32;
    383 		const struct lfs32_dinode *src = &srcu->u_32;
    384 
    385 		/* Get mode, link count, size, and times */
    386 		memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
    387 
    388 		/* Then the rest, except di_blocks */
    389 		dst->di_flags = src->di_flags;
    390 		dst->di_gen = src->di_gen;
    391 		dst->di_uid = src->di_uid;
    392 		dst->di_gid = src->di_gid;
    393 		dst->di_modrev = src->di_modrev;
    394 
    395 		/* Short symlinks store their data in di_db. */
    396 		if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
    397 		    && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
    398 			memcpy(dst->di_db, src->di_db, src->di_size);
    399 		}
    400 	}
    401 }
    402 
    403 static int
    404 update_inoblk(struct lfs_inofuncarg *lifa)
    405 {
    406 	struct lfs *fs;
    407 	daddr_t offset, daddr;
    408 	struct lwp *l;
    409 	struct vnode *devvp, *vp;
    410 	struct inode *ip;
    411 	union lfs_dinode *dip;
    412 	struct buf *dbp, *ibp;
    413 	int error;
    414 	IFILE *ifp;
    415 	SEGUSE *sup;
    416 	unsigned i, num;
    417 	uint32_t gen, osn, nsn;
    418 	char *buf;
    419 	ino_t ino;
    420 
    421 	fs = lifa->fs;
    422 	offset = lifa->offset;
    423 	l = lifa->l;
    424 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    425 
    426 	/*
    427 	 * Get the inode, update times and perms.
    428 	 * DO NOT update disk blocks, we do that separately.
    429 	 */
    430 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    431 	    0, &dbp);
    432 	if (error) {
    433 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
    434 		return error;
    435 	}
    436 	buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK);
    437 	memcpy(buf, dbp->b_data, dbp->b_bcount);
    438 	brelse(dbp, BC_AGE);
    439 	num = LFS_INOPB(fs);
    440 	for (i = num; i-- > 0; ) {
    441 		dip = DINO_IN_BLOCK(fs, buf, i);
    442 		ino = lfs_dino_getinumber(fs, dip);
    443 		if (ino <= LFS_IFILE_INUM)
    444 			continue;
    445 
    446 		LFS_ASSERT_MAXINO(fs, ino);
    447 
    448 		/* Check generation number */
    449 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    450 		gen = lfs_if_getversion(fs, ifp);
    451 		brelse(ibp, 0);
    452 		if (lfs_dino_getgen(fs, dip) < gen) {
    453 			continue;
    454 		}
    455 
    456 		/*
    457 		 * This inode is the newest generation.  Load it.
    458 		 */
    459 		error = lfs_rf_valloc(fs, ino, lfs_dino_getgen(fs, dip),
    460 				      l, &vp, dip);
    461 		if (error) {
    462 			DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
    463 			      " returned %d\n", error));
    464 			continue;
    465 		}
    466 		ip = VTOI(vp);
    467 		if (lfs_dino_getsize(fs, dip) != ip->i_size
    468 		    && vp->v_type != VLNK) {
    469 			/* XXX What should we do with symlinks? */
    470 			DLOG((DLOG_RF, "  ino %jd size %jd -> %jd\n",
    471 				(intmax_t)ino,
    472 				(intmax_t)ip->i_size,
    473 				(intmax_t)lfs_dino_getsize(fs, dip)));
    474 			lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
    475 				     NOCRED);
    476 		}
    477 		update_inoblk_copy_dinode(fs, ip->i_din, dip);
    478 
    479 		ip->i_flags = lfs_dino_getflags(fs, dip);
    480 		ip->i_gen = lfs_dino_getgen(fs, dip);
    481 		ip->i_uid = lfs_dino_getuid(fs, dip);
    482 		ip->i_gid = lfs_dino_getgid(fs, dip);
    483 
    484 		ip->i_mode = lfs_dino_getmode(fs, dip);
    485 		ip->i_nlink = lfs_dino_getnlink(fs, dip);
    486 		ip->i_size = lfs_dino_getsize(fs, dip);
    487 
    488 		LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
    489 
    490 		/* Re-initialize to get type right */
    491 		ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
    492 			  &vp);
    493 
    494 		/* Record change in location */
    495 		LFS_IENTRY(ifp, fs, ino, ibp);
    496 		daddr = lfs_if_getdaddr(fs, ifp);
    497 		lfs_if_setdaddr(fs, ifp, offset);
    498 		error = LFS_BWRITE_LOG(ibp); /* Ifile */
    499 		/* And do segment accounting */
    500 		osn = lfs_dtosn(fs, daddr);
    501 		nsn = lfs_dtosn(fs, offset);
    502 		if (DADDR_IS_BAD(daddr) || osn != nsn) {
    503 			if (!DADDR_IS_BAD(daddr)) {
    504 				LFS_SEGENTRY(sup, fs, osn, ibp);
    505 				sup->su_nbytes -= DINOSIZE(fs);
    506 				LFS_WRITESEGENTRY(sup, fs, osn, ibp);
    507 			}
    508 			LFS_SEGENTRY(sup, fs, nsn, ibp);
    509 			sup->su_nbytes += DINOSIZE(fs);
    510 			LFS_WRITESEGENTRY(sup, fs, nsn, ibp);
    511 		}
    512 		vput(vp);
    513 	}
    514 	free(buf, M_SEGMENT);
    515 
    516 	return 0;
    517 }
    518 
    519 /*
    520  * Note the highest generation number of each inode in the Ifile.
    521  * This allows us to skip processing data for intermediate versions.
    522  */
    523 static int
    524 update_inogen(struct lfs_inofuncarg *lifa)
    525 {
    526 	struct lfs *fs;
    527 	daddr_t offset;
    528 	struct vnode *devvp;
    529 	union lfs_dinode *dip;
    530 	struct buf *dbp, *ibp;
    531 	int error;
    532 	IFILE *ifp;
    533 	unsigned i, num;
    534 
    535 	fs = lifa->fs;
    536 	offset = lifa->offset;
    537 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    538 
    539 	/* Read inode block */
    540 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    541 	    0, &dbp);
    542 	if (error) {
    543 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
    544 		return error;
    545 	}
    546 
    547 	/* Check each inode against ifile entry */
    548 	num = LFS_INOPB(fs);
    549 	for (i = num; i-- > 0; ) {
    550 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
    551 		if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM)
    552 			continue;
    553 
    554 		/* Update generation number */
    555 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    556 		if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip))
    557 			lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip));
    558 		error = LFS_BWRITE_LOG(ibp); /* Ifile */
    559 		if (error)
    560 			break;
    561 	}
    562 	brelse(dbp, 0);
    563 
    564 	return error;
    565 }
    566 
    567 static int
    568 finfo_func_rfw(struct lfs_finfofuncarg *lffa)
    569 {
    570 	struct lfs *fs;
    571 	FINFO *fip;
    572 	daddr_t *offsetp;
    573 	struct lwp *l;
    574 	int j;
    575 	size_t size;
    576 	ino_t ino;
    577 
    578 	fs = lffa->fs;
    579 	fip = lffa->finfop;
    580 	offsetp = lffa->offsetp;
    581 	l = lffa->l;
    582 	size = lfs_sb_getbsize(fs);
    583 	ino = lfs_fi_getino(fs, fip);
    584 	LFS_ASSERT_MAXINO(fs, ino);
    585 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    586 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
    587 			size = lfs_fi_getlastlength(fs, fip);
    588 
    589 		/* Account for and update any direct blocks */
    590 		if (ino > LFS_IFILE_INUM &&
    591 		    lfs_fi_getblock(fs, fip, j) >= 0) {
    592 			update_meta(fs, ino,
    593 				    lfs_fi_getversion(fs, fip),
    594 				    lfs_fi_getblock(fs, fip, j),
    595 				    *offsetp, size, l);
    596 			++rblkcnt;
    597 		}
    598 		*offsetp += lfs_btofsb(fs, size);
    599 	}
    600 
    601 	return 0;
    602 }
    603 
    604 int
    605 lfs_skip_superblock(struct lfs *fs, daddr_t *offsetp)
    606 {
    607 	daddr_t offset;
    608 	int i;
    609 
    610 	/*
    611 	 * If this is segment 0, skip the label.
    612 	 * If the segment has a superblock and we're at the top
    613 	 * of the segment, skip the superblock.
    614 	 */
    615 	offset = *offsetp;
    616 	if (offset == lfs_sb_gets0addr(fs)) {
    617 		offset += lfs_btofsb(fs, LFS_LABELPAD);
    618 	}
    619 	for (i = 0; i < LFS_MAXNUMSB; i++) {
    620 		if (offset == lfs_sb_getsboff(fs, i)) {
    621 			offset += lfs_btofsb(fs, LFS_SBPAD);
    622 			break;
    623 		}
    624 	}
    625 	*offsetp = offset;
    626 	return 0;
    627 }
    628 
    629 /*
    630  * Read the partial sement at offset.
    631  *
    632  * If finfo_func and ino_func are both NULL, check the summary
    633  * and data checksums.  During roll forward, this must be done in its
    634  * entirety before processing any blocks.
    635  *
    636  * If finfo_func is given, use that to process every file block
    637  * in the segment summary.  If ino_func is given, use that to process
    638  * every inode block.
    639  */
    640 int
    641 lfs_parse_pseg(struct lfs *fs, daddr_t *offsetp, u_int64_t nextserial,
    642 	       kauth_cred_t cred, int *pseg_flags, struct lwp *l,
    643 	       int (*ino_func)(struct lfs_inofuncarg *),
    644 	       int (*finfo_func)(struct lfs_finfofuncarg *),
    645 	       int flags, void *arg)
    646 {
    647 	struct vnode *devvp;
    648 	struct buf *bp, *dbp;
    649 	int error, ninos, i, j;
    650 	SEGSUM *ssp;
    651 	daddr_t offset, prevoffset;
    652 	IINFO *iip;
    653 	FINFO *fip;
    654 	size_t size;
    655 	uint32_t datasum, foundsum;
    656 	char *buf;
    657 	struct lfs_inofuncarg lifa;
    658 	struct lfs_finfofuncarg lffa;
    659 
    660 	KASSERT(fs != NULL);
    661 	KASSERT(offsetp != NULL);
    662 
    663 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    664 
    665 	/* Set up callback arguments */
    666 	lifa.fs = fs;
    667 	/* lifa.offset = offset; */
    668 	lifa.cred = cred;
    669 	lifa.l = l;
    670 	lifa.buf = malloc(lfs_sb_getbsize(fs), M_SEGMENT, M_WAITOK);
    671 
    672 	lifa.arg = arg;
    673 
    674 	lffa.fs = fs;
    675 	/* lffa.offsetp = offsetp; */
    676 	/* lffa.finfop = finfop; */
    677 	lffa.cred = cred;
    678 	lffa.l = l;
    679 	lffa.arg = arg;
    680 
    681 	prevoffset = *offsetp;
    682 	lfs_skip_superblock(fs, offsetp);
    683 	offset = *offsetp;
    684 
    685 	/* Read in the segment summary */
    686 	buf = malloc(lfs_sb_getsumsize(fs), M_SEGMENT, M_WAITOK);
    687 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
    688 	    0, &bp);
    689 	if (error)
    690 		goto err;
    691 	memcpy(buf, bp->b_data, bp->b_bcount);
    692 	brelse(bp, BC_AGE);
    693 
    694 	ssp = (SEGSUM *)buf;
    695 
    696 	if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) {
    697 		DLOG((DLOG_RF, "Bad magic at 0x%" PRIx64 "\n",
    698 		      offset));
    699 		offset = -1;
    700 		goto err;
    701 	}
    702 
    703 	if (flags & CKSEG_CKSUM) {
    704 		size_t sumstart;
    705 
    706 		sumstart = lfs_ss_getsumstart(fs);
    707 		if (lfs_ss_getsumsum(fs, ssp) !=
    708 		    cksum((char *)ssp + sumstart,
    709 			  lfs_sb_getsumsize(fs) - sumstart)) {
    710 			DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n",
    711 				offset));
    712 			offset = -1;
    713 			goto err;
    714 		}
    715 	}
    716 
    717 #if 0
    718 	/*
    719 	 * Under normal conditions, we should never be producing
    720 	 * a partial segment with neither inode blocks nor data blocks.
    721 	 * However, these do sometimes appear and they need not
    722 	 * prevent us from continuing.
    723 	 */
    724 	if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
    725 	    lfs_ss_getninos(fs, ssp) == 0) {
    726 		DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n",
    727 		      offset));
    728 		offset = -1;
    729 		goto err;
    730 	}
    731 #endif /* 0 */
    732 
    733 	if (lfs_sb_getversion(fs) == 1) {
    734 		if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
    735 			DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
    736 			offset = -1;
    737 			goto err;
    738 		}
    739 	} else {
    740 		if (nextserial > 0
    741 		    && lfs_ss_getserial(fs, ssp) != nextserial) {
    742 			DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx,"
    743 			      " expected 0x%jx\n", (intmax_t)offset,
    744 			      (intmax_t)lfs_ss_getserial(fs, ssp),
    745 			      (intmax_t)nextserial));
    746 			offset = -1;
    747 			goto err;
    748 		}
    749 		if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
    750 			DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
    751 			      PRIx64 "\n", lfs_ss_getident(fs, ssp),
    752 			      lfs_sb_getident(fs), offset));
    753 			offset = -1;
    754 			goto err;
    755 		}
    756 	}
    757 
    758 #ifdef DIAGNOSTIC
    759 	if (lfs_ss_getnfinfo(fs, ssp) > lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)) {
    760 		printf("At offset 0x%jx, nfinfo %jd > max frags %jd\n",
    761 		       (intmax_t)offset,
    762 		       (intmax_t)lfs_ss_getnfinfo(fs, ssp),
    763 		       (intmax_t)lfs_sb_getssize(fs) / lfs_sb_getfsize(fs));
    764 	}
    765 #endif
    766 	KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getssize(fs) / lfs_sb_getfsize(fs));
    767 #ifdef DIAGNOSTIC
    768 	if (lfs_ss_getnfinfo(fs, ssp) > lfs_sb_getfsize(fs) / sizeof(FINFO32)) {
    769 		printf("At offset 0x%jx, nfinfo %jd > max entries %jd\n",
    770 		       (intmax_t)offset,
    771 		       (intmax_t)lfs_ss_getnfinfo(fs, ssp),
    772 		       (intmax_t)lfs_sb_getssize(fs) / lfs_sb_getfsize(fs));
    773 	}
    774 #endif
    775 	KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getfsize(fs) / sizeof(FINFO32));
    776 
    777 	if (pseg_flags)
    778 		*pseg_flags = lfs_ss_getflags(fs, ssp);
    779 	ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
    780 	iip = SEGSUM_IINFOSTART(fs, buf);
    781 	fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf);
    782 
    783 	/* Handle individual blocks */
    784 	foundsum = 0;
    785 	offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));
    786 	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
    787 		/* Inode block? */
    788 		if (ninos && lfs_ii_getblock(fs, iip) == offset) {
    789 			if (flags & CKSEG_CKSUM) {
    790 				/* Read in the head and add to the buffer */
    791 				error = bread(devvp, LFS_FSBTODB(fs, offset),
    792 					lfs_sb_getbsize(fs), 0, &dbp);
    793 				if (error) {
    794 					offset = -1;
    795 					goto err;
    796 				}
    797 				foundsum = lfs_cksum_part(dbp->b_data,
    798 					sizeof(uint32_t), foundsum);
    799 				brelse(dbp, BC_AGE);
    800 			} else if (ino_func != NULL) {
    801 				lifa.offset = offset;
    802 				error = (*ino_func)(&lifa);
    803 				if (error != 0) {
    804 					offset = -1;
    805 					goto err;
    806 				}
    807 			}
    808 
    809 			offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
    810 			iip = NEXTLOWER_IINFO(fs, iip);
    811 			--ninos;
    812 			--i; /* compensate for ++i in loop header */
    813 			continue;
    814 		}
    815 
    816 		/* File block */
    817 		size = lfs_sb_getbsize(fs);
    818 		if (flags & CKSEG_CKSUM) {
    819 			for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    820 				if (j == lfs_fi_getnblocks(fs, fip) - 1)
    821 					size = lfs_fi_getlastlength(fs, fip);
    822 				error = bread(devvp, LFS_FSBTODB(fs, offset),
    823 					      size, 0, &dbp);
    824 				if (error) {
    825 					offset = -1;
    826 					goto err;
    827 				}
    828 				foundsum = lfs_cksum_part(dbp->b_data,
    829 							  sizeof(uint32_t), foundsum);
    830 				brelse(dbp, BC_AGE);
    831 				offset += lfs_btofsb(fs, size);
    832 			}
    833 		} else if (finfo_func != NULL) {
    834 			lffa.offsetp = &offset;
    835 			lffa.finfop = fip;
    836 			(*finfo_func)(&lffa);
    837 		} else {
    838 			int n = lfs_fi_getnblocks(fs, fip);
    839 			size = lfs_fi_getlastlength(fs, fip);
    840 			offset += lfs_btofsb(fs, lfs_sb_getbsize(fs) * (n - 1)
    841 					     + size);
    842 		}
    843 		fip = NEXT_FINFO(fs, fip);
    844 	}
    845 
    846 	/* Checksum the array, compare */
    847 	if (flags & CKSEG_CKSUM) {
    848 		datasum = lfs_ss_getdatasum(fs, ssp);
    849 		foundsum = lfs_cksum_fold(foundsum);
    850 		if (datasum != foundsum) {
    851 			DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
    852 			      " (wanted %x got %x)\n",
    853 			      offset, datasum, foundsum));
    854 			offset = -1;
    855 			goto err;
    856 		}
    857 	} else {
    858 		/* Don't clog the buffer queue */
    859 		mutex_enter(&lfs_lock);
    860 		if (locked_queue_count > LFS_MAX_BUFS ||
    861 		    locked_queue_bytes > LFS_MAX_BYTES) {
    862 			lfs_flush(fs, SEGM_CKP, 0);
    863 		}
    864 		mutex_exit(&lfs_lock);
    865 	}
    866 
    867 	/*
    868 	 * If we're at the end of the segment, move to the next.
    869 	 * A partial segment needs space for a segment header (1 fsb)
    870 	 * and a full block ("frag" fsb).  Thus, adding "frag" fsb should
    871 	 * still be within the current segment (whereas frag + 1 might
    872 	 * be at the start of the next segment).
    873 	 *
    874 	 * This needs to match the definition of LFS_PARTIAL_FITS
    875 	 * in lfs_segment.c.
    876 	 */
    877 	if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs))
    878 	    != lfs_dtosn(fs, offset)) {
    879 		if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs,
    880 									ssp))) {
    881 			offset = -1;
    882 			goto err;
    883 		}
    884 		offset = lfs_ss_getnext(fs, ssp);
    885 		DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
    886 		       " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
    887 	}
    888 	if (flags & CKSEG_AVAIL)
    889 		lfs_sb_subavail(fs, offset - prevoffset);
    890 
    891     err:
    892 	free(lifa.buf, M_SEGMENT);
    893 	free(buf, M_SEGMENT);
    894 
    895 	*offsetp = offset;
    896 	return 0;
    897 }
    898 
    899 /*
    900  * Roll forward.
    901  */
    902 void
    903 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
    904 {
    905 	int flags, dirty;
    906 	daddr_t startoffset, offset, nextoffset, endpseg;
    907 	u_int64_t nextserial, startserial, endserial;
    908 	int sn, curseg;
    909 	struct proc *p;
    910 	kauth_cred_t cred;
    911 	SEGUSE *sup;
    912 	struct buf *bp;
    913 
    914 	p = l ? l->l_proc : NULL;
    915 	cred = p ? p->p_cred : NOCRED;
    916 
    917 	/*
    918 	 * We don't roll forward for v1 filesystems, because
    919 	 * of the danger that the clock was turned back between the last
    920 	 * checkpoint and crash.  This would roll forward garbage.
    921 	 *
    922 	 * v2 filesystems don't have this problem because they use a
    923 	 * monotonically increasing serial number instead of a timestamp.
    924 	 */
    925 	rblkcnt = 0;
    926 	if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw
    927 	    || lfs_sb_getversion(fs) <= 1 || p == NULL)
    928 		return;
    929 
    930 	DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n",
    931 		lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs)));
    932 	DEBUG_CHECK_FREELIST(fs);
    933 
    934 	/*
    935 	 * Phase I: Find the address of the last good partial
    936 	 * segment that was written after the checkpoint.  Mark
    937 	 * the segments in question dirty, so they won't be
    938 	 * reallocated.
    939 	 */
    940 	endpseg = startoffset = offset = lfs_sb_getoffset(fs);
    941 	flags = 0x0;
    942 	DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
    943 	      PRIx64 "\n", offset));
    944 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    945 	if (!(sup->su_flags & SEGUSE_DIRTY))
    946 		lfs_sb_subnclean(fs, 1);
    947 	sup->su_flags |= SEGUSE_DIRTY;
    948 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    949 
    950 	startserial = lfs_sb_getserial(fs);
    951 	endserial = nextserial = startserial + 1;
    952 	nextoffset = offset;
    953 	while (1) {
    954 		nextoffset = offset;
    955 		lfs_parse_pseg(fs, &nextoffset, nextserial,
    956 			     cred, &flags, l, NULL, NULL, CKSEG_CKSUM, NULL);
    957 		if (nextoffset == -1)
    958 			break;
    959 		if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) {
    960 			LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset),
    961 				     bp);
    962 			if (!(sup->su_flags & SEGUSE_DIRTY))
    963 				lfs_sb_subnclean(fs, 1);
    964 			sup->su_flags |= SEGUSE_DIRTY;
    965 			LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    966 		}
    967 
    968 		DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx"
    969 			" serial=0x%jx\n", (intmax_t)nextoffset,
    970 			(intmax_t)nextserial));
    971 		if (flags & SS_DIROP) {
    972 			DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
    973 			      PRIx64 "\n", offset));
    974 			if (!(flags & SS_CONT)) {
    975 			     DLOG((DLOG_RF, "lfs_mountfs: dirops end "
    976 				   "at 0x%" PRIx64 "\n", offset));
    977 			}
    978 		}
    979 		offset = nextoffset;
    980 		++nextserial;
    981 
    982 		if (!(flags & SS_CONT)) {
    983 			endpseg = nextoffset;
    984 			endserial = nextserial;
    985 		}
    986 		if (lfs_rfw_max_psegs > 0
    987 		    && nextserial > startserial + lfs_rfw_max_psegs)
    988 			break;
    989 	}
    990 	if (flags & SS_CONT) {
    991 		DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
    992 			"dirops discarded (0x%jx < 0x%jx)\n",
    993 			endpseg, nextoffset));
    994 	}
    995 	if (lfs_sb_getversion(fs) > 1)
    996 		lfs_sb_setserial(fs, endserial);
    997 	DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
    998 	      "endpseg=0x%" PRIx64 "\n", endpseg));
    999 	offset = startoffset;
   1000 	if (offset != endpseg) {
   1001 		/* Don't overwrite what we're trying to preserve */
   1002 		lfs_sb_setoffset(fs, endpseg);
   1003 		lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg)));
   1004 		for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
   1005 			sn = (sn + 1) % lfs_sb_getnseg(fs);
   1006 			/* XXX could we just fail to roll forward? */
   1007 			if (sn == curseg)
   1008 				panic("lfs_mountfs: no clean segments");
   1009 			LFS_SEGENTRY(sup, fs, sn, bp);
   1010 			dirty = (sup->su_flags & SEGUSE_DIRTY);
   1011 			brelse(bp, 0);
   1012 			if (!dirty)
   1013 				break;
   1014 		}
   1015 		lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
   1016 		/* Explicitly set this segment dirty */
   1017 		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
   1018 		sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
   1019 		LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
   1020 
   1021 		/*
   1022 		 * Phase II: Identify the highest generation of each
   1023 		 * inode.  We will ignore inodes and data blocks
   1024 		 * belonging to old versions.
   1025 		 */
   1026 		offset = startoffset;
   1027 		nextserial = startserial + 1;
   1028 		DLOG((DLOG_RF, "LFS roll forward phase 2 beginning\n"));
   1029 		while (offset > 0 && offset != endpseg) {
   1030 			lfs_parse_pseg(fs, &offset, nextserial++, cred,
   1031 				     NULL, l, update_inogen, NULL,
   1032 				     CKSEG_NONE, NULL);
   1033 			DEBUG_CHECK_FREELIST(fs);
   1034 		}
   1035 
   1036 		/*
   1037 		 * Phase III: Update inodes.
   1038 		 */
   1039 		offset = startoffset;
   1040 		nextserial = startserial + 1;
   1041 		DLOG((DLOG_RF, "LFS roll forward phase 3 beginning\n"));
   1042 		while (offset > 0 && offset != endpseg) {
   1043 			lfs_parse_pseg(fs, &offset, nextserial++, cred,
   1044 				     NULL, l, update_inoblk, NULL,
   1045 				     CKSEG_NONE, NULL);
   1046 			DEBUG_CHECK_FREELIST(fs);
   1047 		}
   1048 
   1049 		/*
   1050 		 * Phase IV: Roll forward, updating data blocks.
   1051 		 */
   1052 		offset = startoffset;
   1053 		nextserial = startserial + 1;
   1054 		DLOG((DLOG_RF, "LFS roll forward phase 4 beginning\n"));
   1055 		while (offset > 0 && offset != endpseg) {
   1056 			lfs_parse_pseg(fs, &offset, nextserial++, cred,
   1057 				     NULL, l, NULL, finfo_func_rfw,
   1058 				     CKSEG_AVAIL, NULL);
   1059 			DEBUG_CHECK_FREELIST(fs);
   1060 		}
   1061 
   1062 		/*
   1063 		 * Finish: flush our changes to disk.
   1064 		 */
   1065 		lfs_sb_setserial(fs, endserial);
   1066 
   1067 		lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
   1068 		DLOG((DLOG_RF, "lfs_mountfs: roll forward "
   1069 		      "examined %jd blocks\n",
   1070 		      (intmax_t)(endpseg - startoffset)));
   1071 	}
   1072 
   1073 	/* Get rid of our vnodes, except the ifile */
   1074 	drop_vnode_pages(mp, l);
   1075 	DLOG((DLOG_RF, "LFS roll forward complete\n"));
   1076 	printf("%s: roll forward recovered %d data blocks\n",
   1077 		lfs_sb_getfsmnt(fs), rblkcnt);
   1078 
   1079 	/*
   1080 	 * At this point we have no more changes to write to disk.
   1081 	 * Reset the "avail" count to match the segments as they
   1082 	 * appear on disk, and the clean segment count.
   1083 	 */
   1084 	lfs_reset_avail(fs);
   1085 }
   1086 
/*
 * Vnode selector that accepts every vnode it is offered.
 * Neither argument is examined.
 */
static bool
all_selector(void *cl, struct vnode *vp)
{
	(void)cl;
	(void)vp;

	return true;
}
   1092 
   1093 /*
   1094  * Dump any pages from vnodes that may have been put on
   1095  * during truncation.
   1096  */
   1097 static void
   1098 drop_vnode_pages(struct mount *mp, struct lwp *l)
   1099 {
   1100        struct vnode_iterator *marker;
   1101        struct lfs *fs;
   1102        struct vnode *vp;
   1103 
   1104        fs = VFSTOULFS(mp)->um_lfs;
   1105        vfs_vnode_iterator_init(mp, &marker);
   1106        while ((vp = vfs_vnode_iterator_next(marker,
   1107                all_selector, NULL)) != NULL) {
   1108                if (vp == fs->lfs_ivnode)
   1109                        continue;
   1110                VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
   1111                uvm_vnp_setsize(vp, 0);
   1112                uvm_vnp_setsize(vp, VTOI(vp)->i_size);
   1113                VOP_UNLOCK(vp);
   1114                vrele(vp);
   1115        }
   1116        vfs_vnode_iterator_destroy(marker);
   1117 }
   1118