Home | History | Annotate | Line # | Download | only in lfs
      1 /*	$NetBSD: lfs_rfw.c,v 1.37 2025/09/17 04:37:47 perseant Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.37 2025/09/17 04:37:47 perseant Exp $");
     34 
     35 #if defined(_KERNEL_OPT)
     36 #include "opt_quota.h"
     37 #endif
     38 
     39 #include <sys/param.h>
     40 #include <sys/systm.h>
     41 #include <sys/namei.h>
     42 #include <sys/proc.h>
     43 #include <sys/kernel.h>
     44 #include <sys/vnode.h>
     45 #include <sys/mount.h>
     46 #include <sys/kthread.h>
     47 #include <sys/buf.h>
     48 #include <sys/device.h>
     49 #include <sys/file.h>
     50 #include <sys/disklabel.h>
     51 #include <sys/ioctl.h>
     52 #include <sys/errno.h>
     53 #include <sys/malloc.h>
     54 #include <sys/pool.h>
     55 #include <sys/socket.h>
     56 #include <sys/stat.h>
     57 #include <sys/syslog.h>
     58 #include <sys/sysctl.h>
     59 #include <sys/conf.h>
     60 #include <sys/kauth.h>
     61 
     62 #include <miscfs/specfs/specdev.h>
     63 
     64 #include <ufs/lfs/ulfs_quotacommon.h>
     65 #include <ufs/lfs/ulfs_inode.h>
     66 #include <ufs/lfs/ulfsmount.h>
     67 #include <ufs/lfs/ulfs_extern.h>
     68 
     69 #include <uvm/uvm_extern.h>
     70 
     71 #include <ufs/lfs/lfs.h>
     72 #include <ufs/lfs/lfs_accessors.h>
     73 #include <ufs/lfs/lfs_kernel.h>
     74 #include <ufs/lfs/lfs_extern.h>
     75 
     76 #include <miscfs/genfs/genfs.h>
     77 #include <miscfs/genfs/genfs_node.h>
     78 
     79 /*
     80  * Roll-forward code.
     81  */
     82 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
     83     kauth_cred_t, int, int *, struct lwp *);
     84 
     85 static bool all_selector(void *, struct vnode *);
     86 static void drop_vnode_pages(struct mount *, struct lwp *);
     87 static int update_inogen(struct lfs *, daddr_t);
     88 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, const union lfs_dinode *);
     89 
     90 extern int lfs_do_rfw;
     91 int rblkcnt;
     92 int lfs_rfw_max_psegs = 0;
     93 
     94 /*
     95  * Allocate a particular inode with a particular version number, freeing
     96  * any previous versions of this inode that may have gone before.
     97  * Used by the roll-forward code.
     98  *
     99  * XXX this function does not have appropriate locking to be used on a live fs;
    100  * XXX but something similar could probably be used for an "undelete" call.
    101  *
    102  * Called with the Ifile inode locked.
    103  */
    104 int
    105 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
    106 	      struct vnode **vpp, union lfs_dinode *dip)
    107 {
    108 	struct vattr va;
    109 	struct vnode *vp;
    110 	struct inode *ip;
    111 	int error;
    112 
    113 	KASSERT(ino > LFS_IFILE_INUM);
    114 	ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
    115 
    116 	/*
    117 	 * First, just try a vget. If the version number is the one we want,
    118 	 * we don't have to do anything else.  If the version number is wrong,
    119 	 * take appropriate action.
    120 	 */
    121 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
    122 	if (error == 0) {
    123 		DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n",
    124 			(int)ino, vp));
    125 
    126 		*vpp = vp;
    127 		ip = VTOI(vp);
    128 		DLOG((DLOG_RF, "  ip->i_gen=%jd dip nlink %jd seeking"
    129 			" version %jd\n", (intmax_t)ip->i_gen,
    130 			(intmax_t)(dip == NULL ? -1
    131 				: lfs_dino_getnlink(fs, dip)), (intmax_t)vers));
    132 		if (ip->i_gen == vers) {
    133 			/*
    134 			 * We have what we wanted already.
    135 			 */
    136 			DLOG((DLOG_RF, "  pre-existing\n"));
    137 			return 0;
    138 		} else if (ip->i_gen < vers && dip != NULL
    139 			&& lfs_dino_getnlink(fs, dip) > 0) {
    140 			/*
    141 			 * We have found a newer version.  Truncate
    142 			 * the old vnode to zero and re-initialize
    143 			 * from the given dinode.
    144 			 */
    145 			DLOG((DLOG_RF, "  replace old version %jd\n",
    146 				(intmax_t)ip->i_gen));
    147 			lfs_truncate(vp, (off_t)0, 0, NOCRED);
    148 			ip->i_gen = vers;
    149 			vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip));
    150 			update_inoblk_copy_dinode(fs, ip->i_din, dip);
    151 			LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
    152 			return 0;
    153 		} else {
    154 			/*
    155 			 * Not the right version and nothing to
    156 			 * initialize from.  Don't recover this data.
    157 			 */
    158 			DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
    159 				(int)ino, (int)vers,
    160 				(int)lfs_dino_getgen(fs, ip->i_din)));
    161 			vput(vp);
    162 			*vpp = NULLVP;
    163 			return EEXIST;
    164 		}
    165 	}
    166 
    167 	/*
    168 	 * No version of this inode was found in the cache.
    169 	 * Make a new one from the dinode.  We will add data blocks
    170 	 * as they come in, so scrub any block addresses off of the
    171 	 * inode and reset block counts to zero.
    172 	 */
    173 	if (dip == NULL)
    174 		return ENOENT;
    175 
    176 	vattr_null(&va);
    177 	va.va_type = IFTOVT(lfs_dino_getmode(fs, dip));
    178 	va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS;
    179 	va.va_fileid = ino;
    180 	va.va_gen = vers;
    181 	error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL,
    182 	    &vp);
    183 	if (error)
    184 		return error;
    185 	error = vn_lock(vp, LK_EXCLUSIVE);
    186 	if (error)
    187 		goto err;
    188 
    189 	ip = VTOI(vp);
    190 	update_inoblk_copy_dinode(fs, ip->i_din, dip);
    191 
    192 	DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d,"
    193 		" blocks=%d\n", (int)ino, vp, (long long)ip->i_size,
    194 		(int)ip->i_lfs_effnblks,
    195 		(int)lfs_dino_getblocks(fs, ip->i_din)));
    196 	*vpp = vp;
    197 	return 0;
    198 
    199 err:
    200 	vrele(vp);
    201 	*vpp = NULLVP;
    202 	return error;
    203 }
    204 
    205 /*
    206  * Load the appropriate indirect block, and change the appropriate pointer.
    207  * Mark the block dirty.  Do segment and avail accounting.
    208  */
    209 static int
    210 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
    211 	    daddr_t ndaddr, size_t size, struct lwp *l)
    212 {
    213 	int error;
    214 	struct vnode *vp;
    215 	struct inode *ip;
    216 	daddr_t odaddr;
    217 	struct indir a[ULFS_NIADDR];
    218 	int num;
    219 	struct buf *bp;
    220 	SEGUSE *sup;
    221 	u_int64_t newsize, loff;
    222 
    223 	KASSERT(lbn >= 0);	/* no indirect blocks */
    224 	KASSERT(ino > LFS_IFILE_INUM);
    225 
    226 	DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n",
    227 	      (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr));
    228 
    229 	if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0)
    230 		return error;
    231 	ip = VTOI(vp);
    232 
    233 	/*
    234 	 * If block already exists, note its new location
    235 	 * but do not account it as new.
    236 	 */
    237 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
    238 	if (odaddr == UNASSIGNED) {
    239 		if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)),
    240 					size, NOCRED, 0, &bp)) != 0) {
    241 			vput(vp);
    242 			return (error);
    243 		}
    244 		/* No need to write, the block is already on disk */
    245 		if (bp->b_oflags & BO_DELWRI) {
    246 			LFS_UNLOCK_BUF(bp);
    247 			/* Account recovery of the previous version */
    248 			lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
    249 		}
    250 		brelse(bp, BC_INVAL);
    251 		DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d,"
    252 			" lfs_dino_getblocks(fs, ip->i_din) = %d\n",
    253 			(int)ip->i_lfs_effnblks,
    254 			(int)lfs_dino_getblocks(fs, ip->i_din)));
    255 	} else {
    256 		/* XXX fragextend? */
    257 		DLOG((DLOG_RF, "block exists, no balloc\n"));
    258 	}
    259 
    260 	/*
    261 	 * Extend the file, if it is not large enough already.
    262 	 * XXX this is not exactly right, we don't know how much of the
    263 	 * XXX last block is actually used.
    264 	 */
    265 	loff = lfs_lblktosize(fs, lbn);
    266 	if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) {
    267 		/* No fragments */
    268 		newsize = loff + 1;
    269 	} else {
    270 		/* Subtract only a fragment to account for block size */
    271 		newsize = loff + size - lfs_fsbtob(fs, 1) + 1;
    272 	}
    273 
    274 	if (ip->i_size < newsize) {
    275 		DLOG((DLOG_RF, "ino %d size %d -> %d\n",
    276 		      (int)ino, (int)ip->i_size, (int)newsize));
    277 		lfs_dino_setsize(fs, ip->i_din, newsize);
    278 		ip->i_size = newsize;
    279 		/*
    280 		 * tell vm our new size for the case the inode won't
    281 		 * appear later.
    282 		 */
    283 		uvm_vnp_setsize(vp, newsize);
    284 	}
    285 
    286 	lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
    287 
    288 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
    289 	sup->su_nbytes += size;
    290 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
    291 
    292 	/* differences here should be due to UNWRITTEN indirect blocks. */
    293 	if (vp->v_type != VLNK) {
    294 		if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din))
    295 #if 0
    296 		    || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
    297 			 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din))
    298 #endif /* 0 */
    299 			) {
    300 			vprint("vnode", vp);
    301 			printf("effnblks=%jd dino_getblocks=%jd\n",
    302 			       (intmax_t)ip->i_lfs_effnblks,
    303 			       (intmax_t)lfs_dino_getblocks(fs, ip->i_din));
    304 		}
    305 		KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
    306 #if 0
    307 		KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
    308 			ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din));
    309 #endif /* 0 */
    310 	}
    311 
    312 #ifdef DEBUG
    313 	/* Now look again to make sure it worked */
    314 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
    315 	if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
    316 		DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd"
    317 		      " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr));
    318 #endif /* DEBUG */
    319 	vput(vp);
    320 	return 0;
    321 }
    322 
    323 /*
    324  * Copy some the fields of the dinode as needed by update_inoblk().
    325  */
    326 static void
    327 update_inoblk_copy_dinode(struct lfs *fs,
    328     union lfs_dinode *dstu, const union lfs_dinode *srcu)
    329 {
    330 	if (fs->lfs_is64) {
    331 		struct lfs64_dinode *dst = &dstu->u_64;
    332 		const struct lfs64_dinode *src = &srcu->u_64;
    333 		unsigned i;
    334 
    335 		/*
    336 		 * Copy everything but the block pointers and di_blocks.
    337 		 * XXX what about di_extb?
    338 		 */
    339 		dst->di_mode = src->di_mode;
    340 		dst->di_nlink = src->di_nlink;
    341 		dst->di_uid = src->di_uid;
    342 		dst->di_gid = src->di_gid;
    343 		dst->di_blksize = src->di_blksize;
    344 		dst->di_size = src->di_size;
    345 		dst->di_atime = src->di_atime;
    346 		dst->di_mtime = src->di_mtime;
    347 		dst->di_ctime = src->di_ctime;
    348 		dst->di_birthtime = src->di_birthtime;
    349 		dst->di_mtimensec = src->di_mtimensec;
    350 		dst->di_atimensec = src->di_atimensec;
    351 		dst->di_ctimensec = src->di_ctimensec;
    352 		dst->di_birthnsec = src->di_birthnsec;
    353 		dst->di_gen = src->di_gen;
    354 		dst->di_kernflags = src->di_kernflags;
    355 		dst->di_flags = src->di_flags;
    356 		dst->di_extsize = src->di_extsize;
    357 		dst->di_modrev = src->di_modrev;
    358 		dst->di_inumber = src->di_inumber;
    359 		for (i = 0; i < __arraycount(src->di_spare); i++) {
    360 			dst->di_spare[i] = src->di_spare[i];
    361 		}
    362 		/* Short symlinks store their data in di_db. */
    363 		if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
    364 		    && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
    365 			memcpy(dst->di_db, src->di_db, src->di_size);
    366 		}
    367 	} else {
    368 		struct lfs32_dinode *dst = &dstu->u_32;
    369 		const struct lfs32_dinode *src = &srcu->u_32;
    370 
    371 		/* Get mode, link count, size, and times */
    372 		memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
    373 
    374 		/* Then the rest, except di_blocks */
    375 		dst->di_flags = src->di_flags;
    376 		dst->di_gen = src->di_gen;
    377 		dst->di_uid = src->di_uid;
    378 		dst->di_gid = src->di_gid;
    379 		dst->di_modrev = src->di_modrev;
    380 
    381 		/* Short symlinks store their data in di_db. */
    382 		if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
    383 		    && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
    384 			memcpy(dst->di_db, src->di_db, src->di_size);
    385 		}
    386 	}
    387 }
    388 
    389 static int
    390 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
    391 	      struct lwp *l)
    392 {
    393 	struct vnode *devvp, *vp;
    394 	struct inode *ip;
    395 	union lfs_dinode *dip;
    396 	struct buf *dbp, *ibp;
    397 	int error;
    398 	daddr_t daddr;
    399 	IFILE *ifp;
    400 	SEGUSE *sup;
    401 	unsigned i, num;
    402 	uint32_t gen;
    403 	char *buf;
    404 
    405 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    406 
    407 	/*
    408 	 * Get the inode, update times and perms.
    409 	 * DO NOT update disk blocks, we do that separately.
    410 	 */
    411 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    412 	    0, &dbp);
    413 	if (error) {
    414 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
    415 		return error;
    416 	}
    417 	buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK);
    418 	memcpy(buf, dbp->b_data, dbp->b_bcount);
    419 	brelse(dbp, BC_AGE);
    420 	num = LFS_INOPB(fs);
    421 	for (i = num; i-- > 0; ) {
    422 		dip = DINO_IN_BLOCK(fs, buf, i);
    423 		if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM)
    424 			continue;
    425 
    426 		/* Check generation number */
    427 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    428 		gen = lfs_if_getversion(fs, ifp);
    429 		brelse(ibp, 0);
    430 		if (lfs_dino_getgen(fs, dip) < gen) {
    431 			continue;
    432 		}
    433 
    434 		/*
    435 		 * This inode is the newest generation.  Load it.
    436 		 */
    437 		error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip),
    438 				      lfs_dino_getgen(fs, dip),
    439 				      l, &vp, dip);
    440 		if (error) {
    441 			DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
    442 			      " returned %d\n", error));
    443 			continue;
    444 		}
    445 		ip = VTOI(vp);
    446 		if (lfs_dino_getsize(fs, dip) != ip->i_size
    447 		    && vp->v_type != VLNK) {
    448 			/* XXX What should we do sith symlinks? */
    449 			DLOG((DLOG_RF, "  ino %jd size %jd -> %jd\n",
    450 				(intmax_t)lfs_dino_getinumber(fs, dip),
    451 				(intmax_t)ip->i_size,
    452 				(intmax_t)lfs_dino_getsize(fs, dip)));
    453 			lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
    454 				     NOCRED);
    455 		}
    456 		update_inoblk_copy_dinode(fs, ip->i_din, dip);
    457 
    458 		ip->i_flags = lfs_dino_getflags(fs, dip);
    459 		ip->i_gen = lfs_dino_getgen(fs, dip);
    460 		ip->i_uid = lfs_dino_getuid(fs, dip);
    461 		ip->i_gid = lfs_dino_getgid(fs, dip);
    462 
    463 		ip->i_mode = lfs_dino_getmode(fs, dip);
    464 		ip->i_nlink = lfs_dino_getnlink(fs, dip);
    465 		ip->i_size = lfs_dino_getsize(fs, dip);
    466 
    467 		LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
    468 
    469 		/* Re-initialize to get type right */
    470 		ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
    471 			  &vp);
    472 
    473 		/* Record change in location */
    474 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    475 		daddr = lfs_if_getdaddr(fs, ifp);
    476 		lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno));
    477 		error = LFS_BWRITE_LOG(ibp); /* Ifile */
    478 		/* And do segment accounting */
    479 		if (lfs_dtosn(fs, daddr)
    480 		    != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) {
    481 			if (!DADDR_IS_BAD(daddr)) {
    482 				LFS_SEGENTRY(sup, fs,
    483 					     lfs_dtosn(fs, daddr), ibp);
    484 				sup->su_nbytes -= DINOSIZE(fs);
    485 				LFS_WRITESEGENTRY(sup, fs,
    486 						  lfs_dtosn(fs, daddr),
    487 						  ibp);
    488 			}
    489 			LFS_SEGENTRY(sup, fs, lfs_dtosn(fs,
    490 				       LFS_DBTOFSB(fs, dbp->b_blkno)),
    491 				     ibp);
    492 			sup->su_nbytes += DINOSIZE(fs);
    493 			LFS_WRITESEGENTRY(sup, fs,
    494 					  lfs_dtosn(fs, LFS_DBTOFSB(fs,
    495 						dbp->b_blkno)),
    496 					  ibp);
    497 		}
    498 		vput(vp);
    499 	}
    500 	free(buf, M_SEGMENT);
    501 
    502 	return 0;
    503 }
    504 
    505 /*
    506  * Note the highest generation number of each inode in the Ifile.
    507  * This allows us to skip processing data for intermediate versions.
    508  */
    509 static int
    510 update_inogen(struct lfs *fs, daddr_t offset)
    511 {
    512 	struct vnode *devvp;
    513 	union lfs_dinode *dip;
    514 	struct buf *dbp, *ibp;
    515 	int error;
    516 	IFILE *ifp;
    517 	unsigned i, num;
    518 
    519 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    520 
    521 	/* Read inode block */
    522 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    523 	    0, &dbp);
    524 	if (error) {
    525 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
    526 		return error;
    527 	}
    528 
    529 	/* Check each inode against ifile entry */
    530 	num = LFS_INOPB(fs);
    531 	for (i = num; i-- > 0; ) {
    532 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
    533 		if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM)
    534 			continue;
    535 
    536 		/* Update generation number */
    537 		LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
    538 		if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip))
    539 			lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip));
    540 		error = LFS_BWRITE_LOG(ibp); /* Ifile */
    541 		if (error)
    542 			break;
    543 	}
    544 	brelse(dbp, BC_AGE);
    545 
    546 	return error;
    547 }
    548 
/*
 * Values for the "phase" argument of check_segsum().  lfs_roll_forward()
 * runs CHECK_CKSUM first and then steps an integer from CHECK_GEN up to
 * CHECK_DATA, so these must stay consecutive and in this order.
 */
#define CHECK_CKSUM	1	/* Check the checksum to make sure it's valid */
#define CHECK_GEN	2	/* Update highest generation number */
#define CHECK_INODES	3	/* Read and process inodes */
#define CHECK_DATA	4	/* Identify and process data blocks */
    553 
/*
 * Examine the partial segment at "offset" and do the work of one
 * roll-forward phase on it.
 *
 * Arguments:
 *	fs		file system being rolled forward
 *	offset		address of the partial segment, in file system blocks
 *	nextserial	serial number the summary is expected to carry;
 *			compared only in CHECK_CKSUM on file systems whose
 *			version is not 1
 *	cred		passed through to update_inoblk()
 *	phase		one of the CHECK_* values
 *	pseg_flags	if not NULL, receives the summary's flags; it is
 *			not written when the summary cannot be read or is
 *			rejected by the CHECK_CKSUM tests
 *	l		passed through to update_inoblk() and update_meta()
 *
 * Returns the address at which the next partial segment is expected, or
 * -1 if the summary could not be read, failed validation, a helper
 * reported an error, or the link to the next segment is inconsistent.
 */
static daddr_t
check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
	     kauth_cred_t cred, int phase, int *pseg_flags, struct lwp *l)
{
	struct vnode *devvp;
	struct buf *bp, *dbp;
	int error, ninos, i, j;
	SEGSUM *ssp;
	daddr_t prevoffset;
	IINFO *iip;
	FINFO *fip;
	SEGUSE *sup;
	size_t size;
	uint32_t datasum, foundsum;
	char *buf;

	devvp = VTOI(fs->lfs_ivnode)->i_devvp;

	/*
	 * If this is segment 0, skip the label.
	 * If the segment has a superblock and we're at the top
	 * of the segment, skip the superblock.
	 */
	if (offset == lfs_sb_gets0addr(fs))
		offset += lfs_btofsb(fs, LFS_LABELPAD);
	if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) {
		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK)
			offset += lfs_btofsb(fs, LFS_SBPAD);
		brelse(bp, 0);
	}

	/* Read in the segment summary */
	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
	    0, &bp);
	if (error)
		return -1;
	/*
	 * Work from a private copy of the summary so that the buffer can
	 * be released right away; buf is freed at the err label.
	 */
	buf = malloc(bp->b_bcount, M_SEGMENT, M_WAITOK);
	memcpy(buf, bp->b_data, bp->b_bcount);
	brelse(bp, BC_AGE);

	ssp = (SEGSUM *)buf;

	/*
	 * Phase I: Check summary checksum.
	 */
	if (phase == CHECK_CKSUM) {
		size_t sumstart;

		/* The summary checksum covers everything from sumstart on. */
		sumstart = lfs_ss_getsumstart(fs);
		if (lfs_ss_getsumsum(fs, ssp) !=
		    cksum((char *)ssp + sumstart,
			  lfs_sb_getsumsize(fs) - sumstart)) {
			DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n",
				offset));
			offset = -1;
			goto err;
		}
		/* A summary describing no files and no inodes ends the scan. */
		if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
		    lfs_ss_getninos(fs, ssp) == 0) {
			DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n",
				offset));
			offset = -1;
			goto err;
		}
		if (lfs_sb_getversion(fs) == 1) {
			/* Version 1: compare creation time with the superblock. */
			if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
				DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
				offset = -1;
				goto err;
			}
		} else {
			/* Otherwise the serial number and identifier must match. */
			if (lfs_ss_getserial(fs, ssp) != nextserial) {
				DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx,"
				      " expected 0x%jx\n", (intmax_t)offset,
				      (intmax_t)lfs_ss_getserial(fs, ssp),
				      (intmax_t)nextserial));
				offset = -1;
				goto err;
			}
			if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
				DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
				      PRIx64 "\n", lfs_ss_getident(fs, ssp),
				      lfs_sb_getident(fs), offset));
				offset = -1;
				goto err;
			}
		}
	}
	if (pseg_flags)
		*pseg_flags = lfs_ss_getflags(fs, ssp);
	/* Remember where the summary began, then step past it. */
	prevoffset = offset;
	offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));

	/*
	 * Handle individual blocks.  Two cursors walk the summary: iip
	 * over the inode block addresses and fip over the FINFO records.
	 * At each step, if the next inode block address equals the current
	 * offset an inode block is consumed; otherwise one FINFO and all of
	 * its blocks are.  i counts FINFOs only, ninos counts inode blocks
	 * still to come.
	 */
	foundsum = 0;
	ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
	iip = SEGSUM_IINFOSTART(fs, buf);
	fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf);
	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
		/* Inode block? */
		if (ninos && lfs_ii_getblock(fs, iip) == offset) {
			if (phase == CHECK_CKSUM) {
				/*
				 * Read in the head and add to the buffer.
				 * NOTE(review): this reads bsize bytes while
				 * offset advances by ibsize below; confirm
				 * that the difference is intended.
				 */
				error = bread(devvp, LFS_FSBTODB(fs, offset),
					lfs_sb_getbsize(fs), 0, &dbp);
				if (error) {
					offset = -1;
					goto err;
				}
				/* Only the first uint32_t of the block is summed. */
				foundsum = lfs_cksum_part(dbp->b_data,
					sizeof(uint32_t), foundsum);
				brelse(dbp, BC_AGE);
			}
			if (phase == CHECK_GEN) {
				if ((error = update_inogen(fs, offset))
				    != 0) {
					offset = -1;
					goto err;
				}
			}
			if (phase == CHECK_INODES) {
				if ((error = update_inoblk(fs, offset, cred, l))
				    != 0) {
					offset = -1;
					goto err;
				}
			}
			offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
			iip = NEXTLOWER_IINFO(fs, iip);
			--ninos;
			--i; /* compensate for ++i in loop header */
			continue;
		}

		/*
		 * File block.  Every block of the FINFO is a full block
		 * except the last, whose length the FINFO records.
		 */
		size = lfs_sb_getbsize(fs);
		for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
			if (j == lfs_fi_getnblocks(fs, fip) - 1)
				size = lfs_fi_getlastlength(fs, fip);
			if (phase == CHECK_CKSUM) {
				error = bread(devvp, LFS_FSBTODB(fs, offset),
						size, 0, &dbp);
				if (error) {
					offset = -1;
					goto err;
				}
				/* Only the first uint32_t of the block is summed. */
				foundsum = lfs_cksum_part(dbp->b_data,
					  sizeof(uint32_t), foundsum);
				brelse(dbp, BC_AGE);
			}
			/*
			 * Account for and update any direct blocks.
			 * Blocks of the Ifile and below, and blocks with a
			 * negative block number, are passed over.  The
			 * result of update_meta() is not examined, and
			 * rblkcnt is advanced regardless of it.
			 */
			if (phase == CHECK_DATA &&
			   lfs_fi_getino(fs, fip) > LFS_IFILE_INUM &&
			   lfs_fi_getblock(fs, fip, j) >= 0) {
				update_meta(fs, lfs_fi_getino(fs, fip),
					    lfs_fi_getversion(fs, fip),
					    lfs_fi_getblock(fs, fip, j),
					    offset, size, l);
				++rblkcnt;
			}
			offset += lfs_btofsb(fs, size);
		}

		fip = NEXT_FINFO(fs, fip);
	}

	/* Checksum the array, compare */
	if (phase == CHECK_CKSUM) {
		datasum = lfs_ss_getdatasum(fs, ssp);
		foundsum = lfs_cksum_fold(foundsum);
		if (datasum != foundsum) {
			DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
			      " (wanted %x got %x)\n",
			      offset, datasum, foundsum));
			offset = -1;
			goto err;
		}
	}

	/*
	 * In the checksum phase, charge the space this partial segment
	 * occupies against the available count.  In the later phases,
	 * flush if too many buffers have piled up.
	 */
	if (phase == CHECK_CKSUM)
		lfs_sb_subavail(fs, offset - prevoffset);
	else {
		/* Don't clog the buffer queue */
		mutex_enter(&lfs_lock);
		if (locked_queue_count > LFS_MAX_BUFS ||
		    locked_queue_bytes > LFS_MAX_BYTES) {
			/* Called with lfs_lock held. */
			lfs_flush(fs, SEGM_CKP, 0);
		}
		mutex_exit(&lfs_lock);
	}

	/*
	 * If we're at the end of the segment, move to the next.
	 * A partial segment needs space for a segment header (1 fsb)
	 * and a full block ("frag" fsb).  Thus, adding "frag" fsb should
	 * still be within the current segment (whereas frag + 1 might
	 * be at the start of the next segment).
	 *
	 * This needs to match the definition of LFS_PARTIAL_FITS
	 * in lfs_segment.c.
	 */
	if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs))
	    != lfs_dtosn(fs, offset)) {
		/* The summary's "next" link must lead out of this segment. */
		if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs,
									ssp))) {
			printf("WHOA! at 0x%jx/seg %jd moving to 0x%jx/seg %jd\n",
			       (intmax_t)offset,
			       (intmax_t)lfs_dtosn(fs, offset),
			       (intmax_t)lfs_ss_getnext(fs, ssp),
			       (intmax_t)lfs_dtosn(fs, lfs_ss_getnext(fs, ssp)));
			offset = -1;
			goto err;
		}
		offset = lfs_ss_getnext(fs, ssp);
		DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
		       " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
	}

	/* Common exit, reached on success as well: free the summary copy. */
    err:
	free(buf, M_SEGMENT);

	return offset;
}
    778 
    779 void
    780 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
    781 {
    782 	int flags, dirty, phase;
    783 	daddr_t startoffset, offset, nextoffset, endpseg;
    784 	u_int64_t nextserial, startserial, endserial;
    785 	int sn, curseg;
    786 	struct proc *p;
    787 	kauth_cred_t cred;
    788 	SEGUSE *sup;
    789 	struct buf *bp;
    790 
    791 	p = l ? l->l_proc : NULL;
    792 	cred = p ? p->p_cred : NOCRED;
    793 
    794 	/*
    795 	 * Roll forward.
    796 	 *
    797 	 * We don't roll forward for v1 filesystems, because
    798 	 * of the danger that the clock was turned back between the last
    799 	 * checkpoint and crash.  This would roll forward garbage.
    800 	 *
    801 	 * v2 filesystems don't have this problem because they use a
    802 	 * monotonically increasing serial number instead of a timestamp.
    803 	 */
    804 	rblkcnt = 0;
    805 	if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw
    806 	    || lfs_sb_getversion(fs) <= 1 || p == NULL)
    807 		return;
    808 
    809 	DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n",
    810 		lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs)));
    811 	DEBUG_CHECK_FREELIST(fs);
    812 
    813 	/*
    814 	 * Phase I: Find the address of the last good partial
    815 	 * segment that was written after the checkpoint.  Mark
    816 	 * the segments in question dirty, so they won't be
    817 	 * reallocated.
    818 	 */
    819 	endpseg = startoffset = offset = lfs_sb_getoffset(fs);
    820 	flags = 0x0;
    821 	DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
    822 	      PRIx64 "\n", offset));
    823 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    824 	if (!(sup->su_flags & SEGUSE_DIRTY))
    825 		lfs_sb_subnclean(fs, 1);
    826 	sup->su_flags |= SEGUSE_DIRTY;
    827 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    828 
    829 	startserial = lfs_sb_getserial(fs);
    830 	endserial = nextserial = startserial + 1;
    831 	while ((nextoffset = check_segsum(fs, offset, nextserial,
    832 	    cred, CHECK_CKSUM, &flags, l)) > 0) {
    833 		if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) {
    834 			LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset),
    835 				     bp);
    836 			if (!(sup->su_flags & SEGUSE_DIRTY))
    837 				lfs_sb_subnclean(fs, 1);
    838 			sup->su_flags |= SEGUSE_DIRTY;
    839 			LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
    840 		}
    841 
    842 		DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx"
    843 			" serial=0x%jx\n", (intmax_t)nextoffset,
    844 			(intmax_t)nextserial));
    845 		if (flags & SS_DIROP) {
    846 			DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
    847 			      PRIx64 "\n", offset));
    848 			if (!(flags & SS_CONT)) {
    849 			     DLOG((DLOG_RF, "lfs_mountfs: dirops end "
    850 				   "at 0x%" PRIx64 "\n", offset));
    851 			}
    852 		}
    853 		offset = nextoffset;
    854 		++nextserial;
    855 
    856 		if (!(flags & SS_CONT)) {
    857 			endpseg = nextoffset;
    858 			endserial = nextserial;
    859 		}
    860 		if (lfs_rfw_max_psegs > 0
    861 		    && nextserial > startserial + lfs_rfw_max_psegs)
    862 			break;
    863 	}
    864 	if (flags & SS_CONT) {
    865 		DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
    866 			"dirops discarded (0x%jx < 0x%jx)\n",
    867 			endpseg, nextoffset));
    868 	}
    869 	if (lfs_sb_getversion(fs) > 1)
    870 		lfs_sb_setserial(fs, endserial);
    871 	DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
    872 	      "endpseg=0x%" PRIx64 "\n", endpseg));
    873 	offset = startoffset;
    874 	if (offset != endpseg) {
    875 		/* Don't overwrite what we're trying to preserve */
    876 		lfs_sb_setoffset(fs, endpseg);
    877 		lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg)));
    878 		for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
    879 			sn = (sn + 1) % lfs_sb_getnseg(fs);
    880 			/* XXX could we just fail to roll forward? */
    881 			if (sn == curseg)
    882 				panic("lfs_mountfs: no clean segments");
    883 			LFS_SEGENTRY(sup, fs, sn, bp);
    884 			dirty = (sup->su_flags & SEGUSE_DIRTY);
    885 			brelse(bp, 0);
    886 			if (!dirty)
    887 				break;
    888 		}
    889 		lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
    890 		/* Explicitly set this segment dirty */
    891 		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
    892 		sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
    893 		LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
    894 
    895 
    896 		/*
    897 		 * Phase II: Identify the highest generation of each
    898 		 * inode.
    899 		 *
    900 		 * Phase III: Update inodes.  We end up with the
    901 		 * last version of each inode present, and can ignore
    902 		 * data blocks belonging to previous versions.
    903 		 *
    904 		 * Phase IV: Roll forward, updating data blocks.
    905 		 */
    906 		for (phase = CHECK_GEN; phase <= CHECK_DATA; ++phase) {
    907 			offset = startoffset;
    908 			nextserial = startserial + 1;
    909 			printf("LFS roll forward phase %d beginning\n", phase);
    910 			while (offset > 0 && offset != endpseg) {
    911 				if (phase == CHECK_DATA) {
    912 					DLOG((DLOG_RF, "LFS roll forward"
    913 					" phase %d: offset=0x%jx"
    914 					" serial=0x%jx\n",
    915 					phase, (intmax_t)offset,
    916 					(intmax_t)nextserial));
    917 				}
    918 				offset = check_segsum(fs, offset,
    919 			    		nextserial, cred,
    920 					phase, NULL, l);
    921 				++nextserial;
    922 				DEBUG_CHECK_FREELIST(fs);
    923 			}
    924 		}
    925 
    926 		/*
    927 		 * Finish: flush our changes to disk.
    928 		 */
    929 		lfs_sb_setserial(fs, endserial);
    930 
    931 		lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
    932 		DLOG((DLOG_RF, "lfs_mountfs: roll forward "
    933 		      "examined %jd blocks\n",
    934 		      (intmax_t)(endpseg - startoffset)));
    935 	}
    936 
    937 	/* Get rid of our vnodes, except the ifile */
    938 	drop_vnode_pages(mp, l);
    939 	DLOG((DLOG_RF, "LFS roll forward complete\n"));
    940 	printf("%s: roll forward recovered %d data blocks\n",
    941 		lfs_sb_getfsmnt(fs), rblkcnt);
    942 
    943 	/*
    944 	 * At this point we have no more changes to write to disk.
    945 	 * Reset the "avail" count to match the segments as they
    946 	 * appear on disk, and the clean segment count.
    947 	 */
    948 	lfs_reset_avail(fs);
    949 }
    950 
/*
 * Vnode iterator selector that accepts every vnode unconditionally.
 * Neither the opaque cookie nor the candidate vnode is examined.
 */
static bool
all_selector(void *cl, struct vnode *vp)
{
	(void)cl;
	(void)vp;

	return true;
}
    956 
    957 
    958 /*
    959  * Dump any pages from vnodes that may have been put on
    960  * during truncation.
    961  */
    962 static void
    963 drop_vnode_pages(struct mount *mp, struct lwp *l)
    964 {
    965        struct vnode_iterator *marker;
    966        struct lfs *fs;
    967        struct vnode *vp;
    968 
    969        fs = VFSTOULFS(mp)->um_lfs;
    970        vfs_vnode_iterator_init(mp, &marker);
    971        while ((vp = vfs_vnode_iterator_next(marker,
    972                all_selector, NULL)) != NULL) {
    973                if (vp == fs->lfs_ivnode)
    974                        continue;
    975                VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
    976                uvm_vnp_setsize(vp, 0);
    977                uvm_vnp_setsize(vp, VTOI(vp)->i_size);
    978                VOP_UNLOCK(vp);
    979                vrele(vp);
    980        }
    981        vfs_vnode_iterator_destroy(marker);
    982 }
    983 
    984