1 /* $NetBSD: lfs_rfw.c,v 1.38 2025/10/06 20:58:48 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.38 2025/10/06 20:58:48 perseant Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/file.h> 50 #include <sys/disklabel.h> 51 #include <sys/ioctl.h> 52 #include <sys/errno.h> 53 #include <sys/malloc.h> 54 #include <sys/pool.h> 55 #include <sys/socket.h> 56 #include <sys/stat.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 #include <sys/conf.h> 60 #include <sys/kauth.h> 61 62 #include <miscfs/specfs/specdev.h> 63 64 #include <ufs/lfs/ulfs_quotacommon.h> 65 #include <ufs/lfs/ulfs_inode.h> 66 #include <ufs/lfs/ulfsmount.h> 67 #include <ufs/lfs/ulfs_extern.h> 68 69 #include <uvm/uvm_extern.h> 70 71 #include <ufs/lfs/lfs.h> 72 #include <ufs/lfs/lfs_accessors.h> 73 #include <ufs/lfs/lfs_kernel.h> 74 #include <ufs/lfs/lfs_extern.h> 75 76 #include <miscfs/genfs/genfs.h> 77 #include <miscfs/genfs/genfs_node.h> 78 79 /* 80 * Roll-forward code. 81 */ 82 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, 83 kauth_cred_t, int, int *, struct lwp *); 84 85 static bool all_selector(void *, struct vnode *); 86 static void drop_vnode_pages(struct mount *, struct lwp *); 87 static int update_inogen(struct lfs *, daddr_t); 88 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, const union lfs_dinode *); 89 90 extern int lfs_do_rfw; 91 int rblkcnt; 92 int lfs_rfw_max_psegs = 0; 93 94 /* 95 * Allocate a particular inode with a particular version number, freeing 96 * any previous versions of this inode that may have gone before. 97 * Used by the roll-forward code. 98 * 99 * XXX this function does not have appropriate locking to be used on a live fs; 100 * XXX but something similar could probably be used for an "undelete" call. 101 * 102 * Called with the Ifile inode locked. 103 */ 104 int 105 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 106 struct vnode **vpp, union lfs_dinode *dip) 107 { 108 struct vattr va; 109 struct vnode *vp; 110 struct inode *ip; 111 int error; 112 113 KASSERT(ino > LFS_IFILE_INUM); 114 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 115 116 /* 117 * First, just try a vget. If the version number is the one we want, 118 * we don't have to do anything else. If the version number is wrong, 119 * take appropriate action. 120 */ 121 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 122 if (error == 0) { 123 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", 124 (int)ino, vp)); 125 126 *vpp = vp; 127 ip = VTOI(vp); 128 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking" 129 " version %jd\n", (intmax_t)ip->i_gen, 130 (intmax_t)(dip == NULL ? -1 131 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers)); 132 if (ip->i_gen == vers) { 133 /* 134 * We have what we wanted already. 135 */ 136 DLOG((DLOG_RF, " pre-existing\n")); 137 return 0; 138 } else if (ip->i_gen < vers && dip != NULL 139 && lfs_dino_getnlink(fs, dip) > 0) { 140 /* 141 * We have found a newer version. Truncate 142 * the old vnode to zero and re-initialize 143 * from the given dinode. 144 */ 145 DLOG((DLOG_RF, " replace old version %jd\n", 146 (intmax_t)ip->i_gen)); 147 lfs_truncate(vp, (off_t)0, 0, NOCRED); 148 ip->i_gen = vers; 149 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip)); 150 update_inoblk_copy_dinode(fs, ip->i_din, dip); 151 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 152 return 0; 153 } else { 154 /* 155 * Not the right version and nothing to 156 * initialize from. Don't recover this data. 157 */ 158 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 159 (int)ino, (int)vers, 160 (int)lfs_dino_getgen(fs, ip->i_din))); 161 vput(vp); 162 *vpp = NULLVP; 163 return EEXIST; 164 } 165 } 166 167 /* 168 * No version of this inode was found in the cache. 169 * Make a new one from the dinode. We will add data blocks 170 * as they come in, so scrub any block addresses off of the 171 * inode and reset block counts to zero. 172 */ 173 if (dip == NULL) 174 return ENOENT; 175 176 vattr_null(&va); 177 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip)); 178 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS; 179 va.va_fileid = ino; 180 va.va_gen = vers; 181 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL, 182 &vp); 183 if (error) 184 return error; 185 error = vn_lock(vp, LK_EXCLUSIVE); 186 if (error) 187 goto err; 188 189 ip = VTOI(vp); 190 update_inoblk_copy_dinode(fs, ip->i_din, dip); 191 192 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d," 193 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size, 194 (int)ip->i_lfs_effnblks, 195 (int)lfs_dino_getblocks(fs, ip->i_din))); 196 *vpp = vp; 197 return 0; 198 199 err: 200 vrele(vp); 201 *vpp = NULLVP; 202 return error; 203 } 204 205 /* 206 * Load the appropriate indirect block, and change the appropriate pointer. 207 * Mark the block dirty. Do segment and avail accounting. 208 */ 209 static int 210 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 211 daddr_t ndaddr, size_t size, struct lwp *l) 212 { 213 int error; 214 struct vnode *vp; 215 struct inode *ip; 216 daddr_t odaddr; 217 struct indir a[ULFS_NIADDR]; 218 int num; 219 struct buf *bp; 220 SEGUSE *sup; 221 u_int64_t newsize, loff; 222 223 KASSERT(lbn >= 0); /* no indirect blocks */ 224 KASSERT(ino > LFS_IFILE_INUM); 225 226 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n", 227 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr)); 228 229 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0) 230 return error; 231 ip = VTOI(vp); 232 233 /* 234 * If block already exists, note its new location 235 * but do not account it as new. 236 */ 237 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 238 if (odaddr == UNASSIGNED) { 239 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), 240 size, NOCRED, 0, &bp)) != 0) { 241 vput(vp); 242 return (error); 243 } 244 /* No need to write, the block is already on disk */ 245 if (bp->b_oflags & BO_DELWRI) { 246 LFS_UNLOCK_BUF(bp); 247 /* Account recovery of the previous version */ 248 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 249 } 250 brelse(bp, BC_INVAL); 251 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d," 252 " lfs_dino_getblocks(fs, ip->i_din) = %d\n", 253 (int)ip->i_lfs_effnblks, 254 (int)lfs_dino_getblocks(fs, ip->i_din))); 255 } else { 256 /* XXX fragextend? */ 257 DLOG((DLOG_RF, "block exists, no balloc\n")); 258 } 259 260 /* 261 * Extend the file, if it is not large enough already. 262 * XXX this is not exactly right, we don't know how much of the 263 * XXX last block is actually used. 264 */ 265 loff = lfs_lblktosize(fs, lbn); 266 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) { 267 /* No fragments */ 268 newsize = loff + 1; 269 } else { 270 /* Subtract only a fragment to account for block size */ 271 newsize = loff + size - lfs_fsbtob(fs, 1) + 1; 272 } 273 274 if (ip->i_size < newsize) { 275 DLOG((DLOG_RF, "ino %d size %d -> %d\n", 276 (int)ino, (int)ip->i_size, (int)newsize)); 277 lfs_dino_setsize(fs, ip->i_din, newsize); 278 ip->i_size = newsize; 279 /* 280 * tell vm our new size for the case the inode won't 281 * appear later. 282 */ 283 uvm_vnp_setsize(vp, newsize); 284 } 285 286 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 287 288 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 289 sup->su_nbytes += size; 290 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 291 292 /* differences here should be due to UNWRITTEN indirect blocks. */ 293 if (vp->v_type != VLNK) { 294 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)) 295 #if 0 296 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 297 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) 298 #endif /* 0 */ 299 ) { 300 vprint("vnode", vp); 301 printf("effnblks=%jd dino_getblocks=%jd\n", 302 (intmax_t)ip->i_lfs_effnblks, 303 (intmax_t)lfs_dino_getblocks(fs, ip->i_din)); 304 } 305 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 306 #if 0 307 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 308 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)); 309 #endif /* 0 */ 310 } 311 312 #ifdef DEBUG 313 /* Now look again to make sure it worked */ 314 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 315 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 316 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd" 317 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr)); 318 #endif /* DEBUG */ 319 vput(vp); 320 return 0; 321 } 322 323 /* 324 * Copy some the fields of the dinode as needed by update_inoblk(). 325 */ 326 static void 327 update_inoblk_copy_dinode(struct lfs *fs, 328 union lfs_dinode *dstu, const union lfs_dinode *srcu) 329 { 330 if (fs->lfs_is64) { 331 struct lfs64_dinode *dst = &dstu->u_64; 332 const struct lfs64_dinode *src = &srcu->u_64; 333 unsigned i; 334 335 /* 336 * Copy everything but the block pointers and di_blocks. 337 * XXX what about di_extb? 338 */ 339 dst->di_mode = src->di_mode; 340 dst->di_nlink = src->di_nlink; 341 dst->di_uid = src->di_uid; 342 dst->di_gid = src->di_gid; 343 dst->di_blksize = src->di_blksize; 344 dst->di_size = src->di_size; 345 dst->di_atime = src->di_atime; 346 dst->di_mtime = src->di_mtime; 347 dst->di_ctime = src->di_ctime; 348 dst->di_birthtime = src->di_birthtime; 349 dst->di_mtimensec = src->di_mtimensec; 350 dst->di_atimensec = src->di_atimensec; 351 dst->di_ctimensec = src->di_ctimensec; 352 dst->di_birthnsec = src->di_birthnsec; 353 dst->di_gen = src->di_gen; 354 dst->di_kernflags = src->di_kernflags; 355 dst->di_flags = src->di_flags; 356 dst->di_extsize = src->di_extsize; 357 dst->di_modrev = src->di_modrev; 358 dst->di_inumber = src->di_inumber; 359 for (i = 0; i < __arraycount(src->di_spare); i++) { 360 dst->di_spare[i] = src->di_spare[i]; 361 } 362 /* Short symlinks store their data in di_db. */ 363 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 364 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 365 memcpy(dst->di_db, src->di_db, src->di_size); 366 } 367 } else { 368 struct lfs32_dinode *dst = &dstu->u_32; 369 const struct lfs32_dinode *src = &srcu->u_32; 370 371 /* Get mode, link count, size, and times */ 372 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 373 374 /* Then the rest, except di_blocks */ 375 dst->di_flags = src->di_flags; 376 dst->di_gen = src->di_gen; 377 dst->di_uid = src->di_uid; 378 dst->di_gid = src->di_gid; 379 dst->di_modrev = src->di_modrev; 380 381 /* Short symlinks store their data in di_db. */ 382 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 383 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 384 memcpy(dst->di_db, src->di_db, src->di_size); 385 } 386 } 387 } 388 389 static int 390 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred, 391 struct lwp *l) 392 { 393 struct vnode *devvp, *vp; 394 struct inode *ip; 395 union lfs_dinode *dip; 396 struct buf *dbp, *ibp; 397 int error; 398 daddr_t daddr; 399 IFILE *ifp; 400 SEGUSE *sup; 401 unsigned i, num; 402 uint32_t gen; 403 char *buf; 404 405 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 406 407 /* 408 * Get the inode, update times and perms. 409 * DO NOT update disk blocks, we do that separately. 410 */ 411 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 412 0, &dbp); 413 if (error) { 414 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 415 return error; 416 } 417 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK); 418 memcpy(buf, dbp->b_data, dbp->b_bcount); 419 brelse(dbp, BC_AGE); 420 num = LFS_INOPB(fs); 421 for (i = num; i-- > 0; ) { 422 dip = DINO_IN_BLOCK(fs, buf, i); 423 if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM) 424 continue; 425 426 /* Check generation number */ 427 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 428 gen = lfs_if_getversion(fs, ifp); 429 brelse(ibp, 0); 430 if (lfs_dino_getgen(fs, dip) < gen) { 431 continue; 432 } 433 434 /* 435 * This inode is the newest generation. Load it. 436 */ 437 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip), 438 lfs_dino_getgen(fs, dip), 439 l, &vp, dip); 440 if (error) { 441 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 442 " returned %d\n", error)); 443 continue; 444 } 445 ip = VTOI(vp); 446 if (lfs_dino_getsize(fs, dip) != ip->i_size 447 && vp->v_type != VLNK) { 448 /* XXX What should we do sith symlinks? */ 449 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n", 450 (intmax_t)lfs_dino_getinumber(fs, dip), 451 (intmax_t)ip->i_size, 452 (intmax_t)lfs_dino_getsize(fs, dip))); 453 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 454 NOCRED); 455 } 456 update_inoblk_copy_dinode(fs, ip->i_din, dip); 457 458 ip->i_flags = lfs_dino_getflags(fs, dip); 459 ip->i_gen = lfs_dino_getgen(fs, dip); 460 ip->i_uid = lfs_dino_getuid(fs, dip); 461 ip->i_gid = lfs_dino_getgid(fs, dip); 462 463 ip->i_mode = lfs_dino_getmode(fs, dip); 464 ip->i_nlink = lfs_dino_getnlink(fs, dip); 465 ip->i_size = lfs_dino_getsize(fs, dip); 466 467 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 468 469 /* Re-initialize to get type right */ 470 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 471 &vp); 472 473 /* Record change in location */ 474 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 475 daddr = lfs_if_getdaddr(fs, ifp); 476 lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno)); 477 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 478 /* And do segment accounting */ 479 if (lfs_dtosn(fs, daddr) 480 != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) { 481 if (!DADDR_IS_BAD(daddr)) { 482 LFS_SEGENTRY(sup, fs, 483 lfs_dtosn(fs, daddr), ibp); 484 sup->su_nbytes -= DINOSIZE(fs); 485 LFS_WRITESEGENTRY(sup, fs, 486 lfs_dtosn(fs, daddr), 487 ibp); 488 } 489 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, 490 LFS_DBTOFSB(fs, dbp->b_blkno)), 491 ibp); 492 sup->su_nbytes += DINOSIZE(fs); 493 LFS_WRITESEGENTRY(sup, fs, 494 lfs_dtosn(fs, LFS_DBTOFSB(fs, 495 dbp->b_blkno)), 496 ibp); 497 } 498 vput(vp); 499 } 500 free(buf, M_SEGMENT); 501 502 return 0; 503 } 504 505 /* 506 * Note the highest generation number of each inode in the Ifile. 507 * This allows us to skip processing data for intermediate versions. 508 */ 509 static int 510 update_inogen(struct lfs *fs, daddr_t offset) 511 { 512 struct vnode *devvp; 513 union lfs_dinode *dip; 514 struct buf *dbp, *ibp; 515 int error; 516 IFILE *ifp; 517 unsigned i, num; 518 519 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 520 521 /* Read inode block */ 522 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 523 0, &dbp); 524 if (error) { 525 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 526 return error; 527 } 528 529 /* Check each inode against ifile entry */ 530 num = LFS_INOPB(fs); 531 for (i = num; i-- > 0; ) { 532 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 533 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM) 534 continue; 535 536 /* Update generation number */ 537 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 538 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip)) 539 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip)); 540 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 541 if (error) 542 break; 543 } 544 brelse(dbp, BC_AGE); 545 546 return error; 547 } 548 549 #define CHECK_CKSUM 1 /* Check the checksum to make sure it's valid */ 550 #define CHECK_GEN 2 /* Update highest generation number */ 551 #define CHECK_INODES 3 /* Read and process inodes */ 552 #define CHECK_DATA 4 /* Identify and process data blocks */ 553 554 static daddr_t 555 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, 556 kauth_cred_t cred, int phase, int *pseg_flags, struct lwp *l) 557 { 558 struct vnode *devvp; 559 struct buf *bp, *dbp; 560 int error, ninos, i, j; 561 SEGSUM *ssp; 562 daddr_t prevoffset; 563 IINFO *iip; 564 FINFO *fip; 565 SEGUSE *sup; 566 size_t size; 567 uint32_t datasum, foundsum; 568 char *buf; 569 570 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 571 572 /* 573 * If this is segment 0, skip the label. 574 * If the segment has a superblock and we're at the top 575 * of the segment, skip the superblock. 576 */ 577 if (offset == lfs_sb_gets0addr(fs)) 578 offset += lfs_btofsb(fs, LFS_LABELPAD); 579 if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) { 580 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 581 if (sup->su_flags & SEGUSE_SUPERBLOCK) 582 offset += lfs_btofsb(fs, LFS_SBPAD); 583 brelse(bp, 0); 584 } 585 586 /* Read in the segment summary */ 587 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 588 0, &bp); 589 if (error) 590 return -1; 591 buf = malloc(bp->b_bcount, M_SEGMENT, M_WAITOK); 592 memcpy(buf, bp->b_data, bp->b_bcount); 593 brelse(bp, BC_AGE); 594 595 ssp = (SEGSUM *)buf; 596 597 /* 598 * Phase I: Check summary checksum. 599 */ 600 if (phase == CHECK_CKSUM) { 601 size_t sumstart; 602 603 sumstart = lfs_ss_getsumstart(fs); 604 if (lfs_ss_getsumsum(fs, ssp) != 605 cksum((char *)ssp + sumstart, 606 lfs_sb_getsumsize(fs) - sumstart)) { 607 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", 608 offset)); 609 offset = -1; 610 goto err; 611 } 612 #if 0 613 /* 614 * Under normal conditions, we should never be producing 615 * a partial segment with neither inode blocks nor data blocks. 616 * However, these do sometimes appear and they need not 617 * prevent us from continuing. 618 */ 619 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 620 lfs_ss_getninos(fs, ssp) == 0) { 621 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", 622 offset)); 623 offset = -1; 624 goto err; 625 } 626 #endif /* 0 */ 627 if (lfs_sb_getversion(fs) == 1) { 628 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 629 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 630 offset = -1; 631 goto err; 632 } 633 } else { 634 if (lfs_ss_getserial(fs, ssp) != nextserial) { 635 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx," 636 " expected 0x%jx\n", (intmax_t)offset, 637 (intmax_t)lfs_ss_getserial(fs, ssp), 638 (intmax_t)nextserial)); 639 offset = -1; 640 goto err; 641 } 642 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 643 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 644 PRIx64 "\n", lfs_ss_getident(fs, ssp), 645 lfs_sb_getident(fs), offset)); 646 offset = -1; 647 goto err; 648 } 649 } 650 } 651 if (pseg_flags) 652 *pseg_flags = lfs_ss_getflags(fs, ssp); 653 prevoffset = offset; 654 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 655 656 /* Handle individual blocks */ 657 foundsum = 0; 658 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 659 iip = SEGSUM_IINFOSTART(fs, buf); 660 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf); 661 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 662 /* Inode block? */ 663 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 664 if (phase == CHECK_CKSUM) { 665 /* Read in the head and add to the buffer */ 666 error = bread(devvp, LFS_FSBTODB(fs, offset), 667 lfs_sb_getbsize(fs), 0, &dbp); 668 if (error) { 669 offset = -1; 670 goto err; 671 } 672 foundsum = lfs_cksum_part(dbp->b_data, 673 sizeof(uint32_t), foundsum); 674 brelse(dbp, BC_AGE); 675 } 676 if (phase == CHECK_GEN) { 677 if ((error = update_inogen(fs, offset)) 678 != 0) { 679 offset = -1; 680 goto err; 681 } 682 } 683 if (phase == CHECK_INODES) { 684 if ((error = update_inoblk(fs, offset, cred, l)) 685 != 0) { 686 offset = -1; 687 goto err; 688 } 689 } 690 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 691 iip = NEXTLOWER_IINFO(fs, iip); 692 --ninos; 693 --i; /* compensate for ++i in loop header */ 694 continue; 695 } 696 697 /* File block */ 698 size = lfs_sb_getbsize(fs); 699 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 700 if (j == lfs_fi_getnblocks(fs, fip) - 1) 701 size = lfs_fi_getlastlength(fs, fip); 702 if (phase == CHECK_CKSUM) { 703 error = bread(devvp, LFS_FSBTODB(fs, offset), 704 size, 0, &dbp); 705 if (error) { 706 offset = -1; 707 goto err; 708 } 709 foundsum = lfs_cksum_part(dbp->b_data, 710 sizeof(uint32_t), foundsum); 711 brelse(dbp, BC_AGE); 712 } 713 /* Account for and update any direct blocks */ 714 if (phase == CHECK_DATA && 715 lfs_fi_getino(fs, fip) > LFS_IFILE_INUM && 716 lfs_fi_getblock(fs, fip, j) >= 0) { 717 update_meta(fs, lfs_fi_getino(fs, fip), 718 lfs_fi_getversion(fs, fip), 719 lfs_fi_getblock(fs, fip, j), 720 offset, size, l); 721 ++rblkcnt; 722 } 723 offset += lfs_btofsb(fs, size); 724 } 725 726 fip = NEXT_FINFO(fs, fip); 727 } 728 729 /* Checksum the array, compare */ 730 if (phase == CHECK_CKSUM) { 731 datasum = lfs_ss_getdatasum(fs, ssp); 732 foundsum = lfs_cksum_fold(foundsum); 733 if (datasum != foundsum) { 734 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 735 " (wanted %x got %x)\n", 736 offset, datasum, foundsum)); 737 offset = -1; 738 goto err; 739 } 740 } 741 742 if (phase == CHECK_CKSUM) 743 lfs_sb_subavail(fs, offset - prevoffset); 744 else { 745 /* Don't clog the buffer queue */ 746 mutex_enter(&lfs_lock); 747 if (locked_queue_count > LFS_MAX_BUFS || 748 locked_queue_bytes > LFS_MAX_BYTES) { 749 lfs_flush(fs, SEGM_CKP, 0); 750 } 751 mutex_exit(&lfs_lock); 752 } 753 754 /* 755 * If we're at the end of the segment, move to the next. 756 * A partial segment needs space for a segment header (1 fsb) 757 * and a full block ("frag" fsb). Thus, adding "frag" fsb should 758 * still be within the current segment (whereas frag + 1 might 759 * be at the start of the next segment). 760 * 761 * This needs to match the definition of LFS_PARTIAL_FITS 762 * in lfs_segment.c. 763 */ 764 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs)) 765 != lfs_dtosn(fs, offset)) { 766 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, 767 ssp))) { 768 printf("WHOA! at 0x%jx/seg %jd moving to 0x%jx/seg %jd\n", 769 (intmax_t)offset, 770 (intmax_t)lfs_dtosn(fs, offset), 771 (intmax_t)lfs_ss_getnext(fs, ssp), 772 (intmax_t)lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))); 773 offset = -1; 774 goto err; 775 } 776 offset = lfs_ss_getnext(fs, ssp); 777 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 778 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 779 } 780 781 err: 782 free(buf, M_SEGMENT); 783 784 return offset; 785 } 786 787 void 788 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 789 { 790 int flags, dirty, phase; 791 daddr_t startoffset, offset, nextoffset, endpseg; 792 u_int64_t nextserial, startserial, endserial; 793 int sn, curseg; 794 struct proc *p; 795 kauth_cred_t cred; 796 SEGUSE *sup; 797 struct buf *bp; 798 799 p = l ? l->l_proc : NULL; 800 cred = p ? p->p_cred : NOCRED; 801 802 /* 803 * Roll forward. 804 * 805 * We don't roll forward for v1 filesystems, because 806 * of the danger that the clock was turned back between the last 807 * checkpoint and crash. This would roll forward garbage. 808 * 809 * v2 filesystems don't have this problem because they use a 810 * monotonically increasing serial number instead of a timestamp. 811 */ 812 rblkcnt = 0; 813 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw 814 || lfs_sb_getversion(fs) <= 1 || p == NULL) 815 return; 816 817 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n", 818 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs))); 819 DEBUG_CHECK_FREELIST(fs); 820 821 /* 822 * Phase I: Find the address of the last good partial 823 * segment that was written after the checkpoint. Mark 824 * the segments in question dirty, so they won't be 825 * reallocated. 826 */ 827 endpseg = startoffset = offset = lfs_sb_getoffset(fs); 828 flags = 0x0; 829 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 830 PRIx64 "\n", offset)); 831 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 832 if (!(sup->su_flags & SEGUSE_DIRTY)) 833 lfs_sb_subnclean(fs, 1); 834 sup->su_flags |= SEGUSE_DIRTY; 835 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 836 837 startserial = lfs_sb_getserial(fs); 838 endserial = nextserial = startserial + 1; 839 while ((nextoffset = check_segsum(fs, offset, nextserial, 840 cred, CHECK_CKSUM, &flags, l)) > 0) { 841 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) { 842 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), 843 bp); 844 if (!(sup->su_flags & SEGUSE_DIRTY)) 845 lfs_sb_subnclean(fs, 1); 846 sup->su_flags |= SEGUSE_DIRTY; 847 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 848 } 849 850 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx" 851 " serial=0x%jx\n", (intmax_t)nextoffset, 852 (intmax_t)nextserial)); 853 if (flags & SS_DIROP) { 854 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 855 PRIx64 "\n", offset)); 856 if (!(flags & SS_CONT)) { 857 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 858 "at 0x%" PRIx64 "\n", offset)); 859 } 860 } 861 offset = nextoffset; 862 ++nextserial; 863 864 if (!(flags & SS_CONT)) { 865 endpseg = nextoffset; 866 endserial = nextserial; 867 } 868 if (lfs_rfw_max_psegs > 0 869 && nextserial > startserial + lfs_rfw_max_psegs) 870 break; 871 } 872 if (flags & SS_CONT) { 873 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 874 "dirops discarded (0x%jx < 0x%jx)\n", 875 endpseg, nextoffset)); 876 } 877 if (lfs_sb_getversion(fs) > 1) 878 lfs_sb_setserial(fs, endserial); 879 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 880 "endpseg=0x%" PRIx64 "\n", endpseg)); 881 offset = startoffset; 882 if (offset != endpseg) { 883 /* Don't overwrite what we're trying to preserve */ 884 lfs_sb_setoffset(fs, endpseg); 885 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg))); 886 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 887 sn = (sn + 1) % lfs_sb_getnseg(fs); 888 /* XXX could we just fail to roll forward? */ 889 if (sn == curseg) 890 panic("lfs_mountfs: no clean segments"); 891 LFS_SEGENTRY(sup, fs, sn, bp); 892 dirty = (sup->su_flags & SEGUSE_DIRTY); 893 brelse(bp, 0); 894 if (!dirty) 895 break; 896 } 897 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 898 /* Explicitly set this segment dirty */ 899 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 900 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 901 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 902 903 904 /* 905 * Phase II: Identify the highest generation of each 906 * inode. 907 * 908 * Phase III: Update inodes. We end up with the 909 * last version of each inode present, and can ignore 910 * data blocks belonging to previous versions. 911 * 912 * Phase IV: Roll forward, updating data blocks. 913 */ 914 for (phase = CHECK_GEN; phase <= CHECK_DATA; ++phase) { 915 offset = startoffset; 916 nextserial = startserial + 1; 917 printf("LFS roll forward phase %d beginning\n", phase); 918 while (offset > 0 && offset != endpseg) { 919 if (phase == CHECK_DATA) { 920 DLOG((DLOG_RF, "LFS roll forward" 921 " phase %d: offset=0x%jx" 922 " serial=0x%jx\n", 923 phase, (intmax_t)offset, 924 (intmax_t)nextserial)); 925 } 926 offset = check_segsum(fs, offset, 927 nextserial, cred, 928 phase, NULL, l); 929 ++nextserial; 930 DEBUG_CHECK_FREELIST(fs); 931 } 932 } 933 934 /* 935 * Finish: flush our changes to disk. 936 */ 937 lfs_sb_setserial(fs, endserial); 938 939 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 940 DLOG((DLOG_RF, "lfs_mountfs: roll forward " 941 "examined %jd blocks\n", 942 (intmax_t)(endpseg - startoffset))); 943 } 944 945 /* Get rid of our vnodes, except the ifile */ 946 drop_vnode_pages(mp, l); 947 DLOG((DLOG_RF, "LFS roll forward complete\n")); 948 printf("%s: roll forward recovered %d data blocks\n", 949 lfs_sb_getfsmnt(fs), rblkcnt); 950 951 /* 952 * At this point we have no more changes to write to disk. 953 * Reset the "avail" count to match the segments as they 954 * appear on disk, and the clean segment count. 955 */ 956 lfs_reset_avail(fs); 957 } 958 959 static bool 960 all_selector(void *cl, struct vnode *vp) 961 { 962 return true; 963 } 964 965 966 /* 967 * Dump any pages from vnodes that may have been put on 968 * during truncation. 969 */ 970 static void 971 drop_vnode_pages(struct mount *mp, struct lwp *l) 972 { 973 struct vnode_iterator *marker; 974 struct lfs *fs; 975 struct vnode *vp; 976 977 fs = VFSTOULFS(mp)->um_lfs; 978 vfs_vnode_iterator_init(mp, &marker); 979 while ((vp = vfs_vnode_iterator_next(marker, 980 all_selector, NULL)) != NULL) { 981 if (vp == fs->lfs_ivnode) 982 continue; 983 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); 984 uvm_vnp_setsize(vp, 0); 985 uvm_vnp_setsize(vp, VTOI(vp)->i_size); 986 VOP_UNLOCK(vp); 987 vrele(vp); 988 } 989 vfs_vnode_iterator_destroy(marker); 990 } 991 992