/*	$NetBSD: lfs_rfw.c,v 1.37 2025/09/17 04:37:47 perseant Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.37 2025/09/17 04:37:47 perseant Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/file.h> 50 #include <sys/disklabel.h> 51 #include <sys/ioctl.h> 52 #include <sys/errno.h> 53 #include <sys/malloc.h> 54 #include <sys/pool.h> 55 #include <sys/socket.h> 56 #include <sys/stat.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 #include <sys/conf.h> 60 #include <sys/kauth.h> 61 62 #include <miscfs/specfs/specdev.h> 63 64 #include <ufs/lfs/ulfs_quotacommon.h> 65 #include <ufs/lfs/ulfs_inode.h> 66 #include <ufs/lfs/ulfsmount.h> 67 #include <ufs/lfs/ulfs_extern.h> 68 69 #include <uvm/uvm_extern.h> 70 71 #include <ufs/lfs/lfs.h> 72 #include <ufs/lfs/lfs_accessors.h> 73 #include <ufs/lfs/lfs_kernel.h> 74 #include <ufs/lfs/lfs_extern.h> 75 76 #include <miscfs/genfs/genfs.h> 77 #include <miscfs/genfs/genfs_node.h> 78 79 /* 80 * Roll-forward code. 81 */ 82 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, 83 kauth_cred_t, int, int *, struct lwp *); 84 85 static bool all_selector(void *, struct vnode *); 86 static void drop_vnode_pages(struct mount *, struct lwp *); 87 static int update_inogen(struct lfs *, daddr_t); 88 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, const union lfs_dinode *); 89 90 extern int lfs_do_rfw; 91 int rblkcnt; 92 int lfs_rfw_max_psegs = 0; 93 94 /* 95 * Allocate a particular inode with a particular version number, freeing 96 * any previous versions of this inode that may have gone before. 97 * Used by the roll-forward code. 
98 * 99 * XXX this function does not have appropriate locking to be used on a live fs; 100 * XXX but something similar could probably be used for an "undelete" call. 101 * 102 * Called with the Ifile inode locked. 103 */ 104 int 105 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 106 struct vnode **vpp, union lfs_dinode *dip) 107 { 108 struct vattr va; 109 struct vnode *vp; 110 struct inode *ip; 111 int error; 112 113 KASSERT(ino > LFS_IFILE_INUM); 114 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 115 116 /* 117 * First, just try a vget. If the version number is the one we want, 118 * we don't have to do anything else. If the version number is wrong, 119 * take appropriate action. 120 */ 121 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 122 if (error == 0) { 123 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", 124 (int)ino, vp)); 125 126 *vpp = vp; 127 ip = VTOI(vp); 128 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking" 129 " version %jd\n", (intmax_t)ip->i_gen, 130 (intmax_t)(dip == NULL ? -1 131 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers)); 132 if (ip->i_gen == vers) { 133 /* 134 * We have what we wanted already. 135 */ 136 DLOG((DLOG_RF, " pre-existing\n")); 137 return 0; 138 } else if (ip->i_gen < vers && dip != NULL 139 && lfs_dino_getnlink(fs, dip) > 0) { 140 /* 141 * We have found a newer version. Truncate 142 * the old vnode to zero and re-initialize 143 * from the given dinode. 144 */ 145 DLOG((DLOG_RF, " replace old version %jd\n", 146 (intmax_t)ip->i_gen)); 147 lfs_truncate(vp, (off_t)0, 0, NOCRED); 148 ip->i_gen = vers; 149 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip)); 150 update_inoblk_copy_dinode(fs, ip->i_din, dip); 151 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 152 return 0; 153 } else { 154 /* 155 * Not the right version and nothing to 156 * initialize from. Don't recover this data. 
157 */ 158 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 159 (int)ino, (int)vers, 160 (int)lfs_dino_getgen(fs, ip->i_din))); 161 vput(vp); 162 *vpp = NULLVP; 163 return EEXIST; 164 } 165 } 166 167 /* 168 * No version of this inode was found in the cache. 169 * Make a new one from the dinode. We will add data blocks 170 * as they come in, so scrub any block addresses off of the 171 * inode and reset block counts to zero. 172 */ 173 if (dip == NULL) 174 return ENOENT; 175 176 vattr_null(&va); 177 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip)); 178 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS; 179 va.va_fileid = ino; 180 va.va_gen = vers; 181 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL, 182 &vp); 183 if (error) 184 return error; 185 error = vn_lock(vp, LK_EXCLUSIVE); 186 if (error) 187 goto err; 188 189 ip = VTOI(vp); 190 update_inoblk_copy_dinode(fs, ip->i_din, dip); 191 192 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d," 193 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size, 194 (int)ip->i_lfs_effnblks, 195 (int)lfs_dino_getblocks(fs, ip->i_din))); 196 *vpp = vp; 197 return 0; 198 199 err: 200 vrele(vp); 201 *vpp = NULLVP; 202 return error; 203 } 204 205 /* 206 * Load the appropriate indirect block, and change the appropriate pointer. 207 * Mark the block dirty. Do segment and avail accounting. 
208 */ 209 static int 210 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 211 daddr_t ndaddr, size_t size, struct lwp *l) 212 { 213 int error; 214 struct vnode *vp; 215 struct inode *ip; 216 daddr_t odaddr; 217 struct indir a[ULFS_NIADDR]; 218 int num; 219 struct buf *bp; 220 SEGUSE *sup; 221 u_int64_t newsize, loff; 222 223 KASSERT(lbn >= 0); /* no indirect blocks */ 224 KASSERT(ino > LFS_IFILE_INUM); 225 226 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n", 227 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr)); 228 229 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0) 230 return error; 231 ip = VTOI(vp); 232 233 /* 234 * If block already exists, note its new location 235 * but do not account it as new. 236 */ 237 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 238 if (odaddr == UNASSIGNED) { 239 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), 240 size, NOCRED, 0, &bp)) != 0) { 241 vput(vp); 242 return (error); 243 } 244 /* No need to write, the block is already on disk */ 245 if (bp->b_oflags & BO_DELWRI) { 246 LFS_UNLOCK_BUF(bp); 247 /* Account recovery of the previous version */ 248 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 249 } 250 brelse(bp, BC_INVAL); 251 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d," 252 " lfs_dino_getblocks(fs, ip->i_din) = %d\n", 253 (int)ip->i_lfs_effnblks, 254 (int)lfs_dino_getblocks(fs, ip->i_din))); 255 } else { 256 /* XXX fragextend? */ 257 DLOG((DLOG_RF, "block exists, no balloc\n")); 258 } 259 260 /* 261 * Extend the file, if it is not large enough already. 262 * XXX this is not exactly right, we don't know how much of the 263 * XXX last block is actually used. 
264 */ 265 loff = lfs_lblktosize(fs, lbn); 266 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) { 267 /* No fragments */ 268 newsize = loff + 1; 269 } else { 270 /* Subtract only a fragment to account for block size */ 271 newsize = loff + size - lfs_fsbtob(fs, 1) + 1; 272 } 273 274 if (ip->i_size < newsize) { 275 DLOG((DLOG_RF, "ino %d size %d -> %d\n", 276 (int)ino, (int)ip->i_size, (int)newsize)); 277 lfs_dino_setsize(fs, ip->i_din, newsize); 278 ip->i_size = newsize; 279 /* 280 * tell vm our new size for the case the inode won't 281 * appear later. 282 */ 283 uvm_vnp_setsize(vp, newsize); 284 } 285 286 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 287 288 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 289 sup->su_nbytes += size; 290 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 291 292 /* differences here should be due to UNWRITTEN indirect blocks. */ 293 if (vp->v_type != VLNK) { 294 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)) 295 #if 0 296 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 297 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) 298 #endif /* 0 */ 299 ) { 300 vprint("vnode", vp); 301 printf("effnblks=%jd dino_getblocks=%jd\n", 302 (intmax_t)ip->i_lfs_effnblks, 303 (intmax_t)lfs_dino_getblocks(fs, ip->i_din)); 304 } 305 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 306 #if 0 307 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 308 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)); 309 #endif /* 0 */ 310 } 311 312 #ifdef DEBUG 313 /* Now look again to make sure it worked */ 314 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 315 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 316 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd" 317 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr)); 318 #endif /* DEBUG */ 319 vput(vp); 320 return 0; 321 } 322 323 /* 324 * Copy some the fields of the dinode as needed by update_inoblk(). 
325 */ 326 static void 327 update_inoblk_copy_dinode(struct lfs *fs, 328 union lfs_dinode *dstu, const union lfs_dinode *srcu) 329 { 330 if (fs->lfs_is64) { 331 struct lfs64_dinode *dst = &dstu->u_64; 332 const struct lfs64_dinode *src = &srcu->u_64; 333 unsigned i; 334 335 /* 336 * Copy everything but the block pointers and di_blocks. 337 * XXX what about di_extb? 338 */ 339 dst->di_mode = src->di_mode; 340 dst->di_nlink = src->di_nlink; 341 dst->di_uid = src->di_uid; 342 dst->di_gid = src->di_gid; 343 dst->di_blksize = src->di_blksize; 344 dst->di_size = src->di_size; 345 dst->di_atime = src->di_atime; 346 dst->di_mtime = src->di_mtime; 347 dst->di_ctime = src->di_ctime; 348 dst->di_birthtime = src->di_birthtime; 349 dst->di_mtimensec = src->di_mtimensec; 350 dst->di_atimensec = src->di_atimensec; 351 dst->di_ctimensec = src->di_ctimensec; 352 dst->di_birthnsec = src->di_birthnsec; 353 dst->di_gen = src->di_gen; 354 dst->di_kernflags = src->di_kernflags; 355 dst->di_flags = src->di_flags; 356 dst->di_extsize = src->di_extsize; 357 dst->di_modrev = src->di_modrev; 358 dst->di_inumber = src->di_inumber; 359 for (i = 0; i < __arraycount(src->di_spare); i++) { 360 dst->di_spare[i] = src->di_spare[i]; 361 } 362 /* Short symlinks store their data in di_db. */ 363 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 364 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 365 memcpy(dst->di_db, src->di_db, src->di_size); 366 } 367 } else { 368 struct lfs32_dinode *dst = &dstu->u_32; 369 const struct lfs32_dinode *src = &srcu->u_32; 370 371 /* Get mode, link count, size, and times */ 372 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 373 374 /* Then the rest, except di_blocks */ 375 dst->di_flags = src->di_flags; 376 dst->di_gen = src->di_gen; 377 dst->di_uid = src->di_uid; 378 dst->di_gid = src->di_gid; 379 dst->di_modrev = src->di_modrev; 380 381 /* Short symlinks store their data in di_db. 
*/ 382 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 383 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 384 memcpy(dst->di_db, src->di_db, src->di_size); 385 } 386 } 387 } 388 389 static int 390 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred, 391 struct lwp *l) 392 { 393 struct vnode *devvp, *vp; 394 struct inode *ip; 395 union lfs_dinode *dip; 396 struct buf *dbp, *ibp; 397 int error; 398 daddr_t daddr; 399 IFILE *ifp; 400 SEGUSE *sup; 401 unsigned i, num; 402 uint32_t gen; 403 char *buf; 404 405 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 406 407 /* 408 * Get the inode, update times and perms. 409 * DO NOT update disk blocks, we do that separately. 410 */ 411 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 412 0, &dbp); 413 if (error) { 414 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 415 return error; 416 } 417 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK); 418 memcpy(buf, dbp->b_data, dbp->b_bcount); 419 brelse(dbp, BC_AGE); 420 num = LFS_INOPB(fs); 421 for (i = num; i-- > 0; ) { 422 dip = DINO_IN_BLOCK(fs, buf, i); 423 if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM) 424 continue; 425 426 /* Check generation number */ 427 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 428 gen = lfs_if_getversion(fs, ifp); 429 brelse(ibp, 0); 430 if (lfs_dino_getgen(fs, dip) < gen) { 431 continue; 432 } 433 434 /* 435 * This inode is the newest generation. Load it. 436 */ 437 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip), 438 lfs_dino_getgen(fs, dip), 439 l, &vp, dip); 440 if (error) { 441 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 442 " returned %d\n", error)); 443 continue; 444 } 445 ip = VTOI(vp); 446 if (lfs_dino_getsize(fs, dip) != ip->i_size 447 && vp->v_type != VLNK) { 448 /* XXX What should we do sith symlinks? 
*/ 449 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n", 450 (intmax_t)lfs_dino_getinumber(fs, dip), 451 (intmax_t)ip->i_size, 452 (intmax_t)lfs_dino_getsize(fs, dip))); 453 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 454 NOCRED); 455 } 456 update_inoblk_copy_dinode(fs, ip->i_din, dip); 457 458 ip->i_flags = lfs_dino_getflags(fs, dip); 459 ip->i_gen = lfs_dino_getgen(fs, dip); 460 ip->i_uid = lfs_dino_getuid(fs, dip); 461 ip->i_gid = lfs_dino_getgid(fs, dip); 462 463 ip->i_mode = lfs_dino_getmode(fs, dip); 464 ip->i_nlink = lfs_dino_getnlink(fs, dip); 465 ip->i_size = lfs_dino_getsize(fs, dip); 466 467 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 468 469 /* Re-initialize to get type right */ 470 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 471 &vp); 472 473 /* Record change in location */ 474 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 475 daddr = lfs_if_getdaddr(fs, ifp); 476 lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno)); 477 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 478 /* And do segment accounting */ 479 if (lfs_dtosn(fs, daddr) 480 != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) { 481 if (!DADDR_IS_BAD(daddr)) { 482 LFS_SEGENTRY(sup, fs, 483 lfs_dtosn(fs, daddr), ibp); 484 sup->su_nbytes -= DINOSIZE(fs); 485 LFS_WRITESEGENTRY(sup, fs, 486 lfs_dtosn(fs, daddr), 487 ibp); 488 } 489 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, 490 LFS_DBTOFSB(fs, dbp->b_blkno)), 491 ibp); 492 sup->su_nbytes += DINOSIZE(fs); 493 LFS_WRITESEGENTRY(sup, fs, 494 lfs_dtosn(fs, LFS_DBTOFSB(fs, 495 dbp->b_blkno)), 496 ibp); 497 } 498 vput(vp); 499 } 500 free(buf, M_SEGMENT); 501 502 return 0; 503 } 504 505 /* 506 * Note the highest generation number of each inode in the Ifile. 507 * This allows us to skip processing data for intermediate versions. 
508 */ 509 static int 510 update_inogen(struct lfs *fs, daddr_t offset) 511 { 512 struct vnode *devvp; 513 union lfs_dinode *dip; 514 struct buf *dbp, *ibp; 515 int error; 516 IFILE *ifp; 517 unsigned i, num; 518 519 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 520 521 /* Read inode block */ 522 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 523 0, &dbp); 524 if (error) { 525 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 526 return error; 527 } 528 529 /* Check each inode against ifile entry */ 530 num = LFS_INOPB(fs); 531 for (i = num; i-- > 0; ) { 532 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 533 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM) 534 continue; 535 536 /* Update generation number */ 537 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 538 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip)) 539 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip)); 540 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 541 if (error) 542 break; 543 } 544 brelse(dbp, BC_AGE); 545 546 return error; 547 } 548 549 #define CHECK_CKSUM 1 /* Check the checksum to make sure it's valid */ 550 #define CHECK_GEN 2 /* Update highest generation number */ 551 #define CHECK_INODES 3 /* Read and process inodes */ 552 #define CHECK_DATA 4 /* Identify and process data blocks */ 553 554 static daddr_t 555 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, 556 kauth_cred_t cred, int phase, int *pseg_flags, struct lwp *l) 557 { 558 struct vnode *devvp; 559 struct buf *bp, *dbp; 560 int error, ninos, i, j; 561 SEGSUM *ssp; 562 daddr_t prevoffset; 563 IINFO *iip; 564 FINFO *fip; 565 SEGUSE *sup; 566 size_t size; 567 uint32_t datasum, foundsum; 568 char *buf; 569 570 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 571 572 /* 573 * If this is segment 0, skip the label. 574 * If the segment has a superblock and we're at the top 575 * of the segment, skip the superblock. 
576 */ 577 if (offset == lfs_sb_gets0addr(fs)) 578 offset += lfs_btofsb(fs, LFS_LABELPAD); 579 if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) { 580 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 581 if (sup->su_flags & SEGUSE_SUPERBLOCK) 582 offset += lfs_btofsb(fs, LFS_SBPAD); 583 brelse(bp, 0); 584 } 585 586 /* Read in the segment summary */ 587 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 588 0, &bp); 589 if (error) 590 return -1; 591 buf = malloc(bp->b_bcount, M_SEGMENT, M_WAITOK); 592 memcpy(buf, bp->b_data, bp->b_bcount); 593 brelse(bp, BC_AGE); 594 595 ssp = (SEGSUM *)buf; 596 597 /* 598 * Phase I: Check summary checksum. 599 */ 600 if (phase == CHECK_CKSUM) { 601 size_t sumstart; 602 603 sumstart = lfs_ss_getsumstart(fs); 604 if (lfs_ss_getsumsum(fs, ssp) != 605 cksum((char *)ssp + sumstart, 606 lfs_sb_getsumsize(fs) - sumstart)) { 607 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", 608 offset)); 609 offset = -1; 610 goto err; 611 } 612 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 613 lfs_ss_getninos(fs, ssp) == 0) { 614 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", 615 offset)); 616 offset = -1; 617 goto err; 618 } 619 if (lfs_sb_getversion(fs) == 1) { 620 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 621 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 622 offset = -1; 623 goto err; 624 } 625 } else { 626 if (lfs_ss_getserial(fs, ssp) != nextserial) { 627 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx," 628 " expected 0x%jx\n", (intmax_t)offset, 629 (intmax_t)lfs_ss_getserial(fs, ssp), 630 (intmax_t)nextserial)); 631 offset = -1; 632 goto err; 633 } 634 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 635 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 636 PRIx64 "\n", lfs_ss_getident(fs, ssp), 637 lfs_sb_getident(fs), offset)); 638 offset = -1; 639 goto err; 640 } 641 } 642 } 643 if (pseg_flags) 644 *pseg_flags = lfs_ss_getflags(fs, ssp); 645 prevoffset = offset; 646 offset += 
lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 647 648 /* Handle individual blocks */ 649 foundsum = 0; 650 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 651 iip = SEGSUM_IINFOSTART(fs, buf); 652 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf); 653 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 654 /* Inode block? */ 655 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 656 if (phase == CHECK_CKSUM) { 657 /* Read in the head and add to the buffer */ 658 error = bread(devvp, LFS_FSBTODB(fs, offset), 659 lfs_sb_getbsize(fs), 0, &dbp); 660 if (error) { 661 offset = -1; 662 goto err; 663 } 664 foundsum = lfs_cksum_part(dbp->b_data, 665 sizeof(uint32_t), foundsum); 666 brelse(dbp, BC_AGE); 667 } 668 if (phase == CHECK_GEN) { 669 if ((error = update_inogen(fs, offset)) 670 != 0) { 671 offset = -1; 672 goto err; 673 } 674 } 675 if (phase == CHECK_INODES) { 676 if ((error = update_inoblk(fs, offset, cred, l)) 677 != 0) { 678 offset = -1; 679 goto err; 680 } 681 } 682 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 683 iip = NEXTLOWER_IINFO(fs, iip); 684 --ninos; 685 --i; /* compensate for ++i in loop header */ 686 continue; 687 } 688 689 /* File block */ 690 size = lfs_sb_getbsize(fs); 691 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 692 if (j == lfs_fi_getnblocks(fs, fip) - 1) 693 size = lfs_fi_getlastlength(fs, fip); 694 if (phase == CHECK_CKSUM) { 695 error = bread(devvp, LFS_FSBTODB(fs, offset), 696 size, 0, &dbp); 697 if (error) { 698 offset = -1; 699 goto err; 700 } 701 foundsum = lfs_cksum_part(dbp->b_data, 702 sizeof(uint32_t), foundsum); 703 brelse(dbp, BC_AGE); 704 } 705 /* Account for and update any direct blocks */ 706 if (phase == CHECK_DATA && 707 lfs_fi_getino(fs, fip) > LFS_IFILE_INUM && 708 lfs_fi_getblock(fs, fip, j) >= 0) { 709 update_meta(fs, lfs_fi_getino(fs, fip), 710 lfs_fi_getversion(fs, fip), 711 lfs_fi_getblock(fs, fip, j), 712 offset, size, l); 713 ++rblkcnt; 714 } 715 offset += lfs_btofsb(fs, size); 716 } 717 718 fip = 
NEXT_FINFO(fs, fip); 719 } 720 721 /* Checksum the array, compare */ 722 if (phase == CHECK_CKSUM) { 723 datasum = lfs_ss_getdatasum(fs, ssp); 724 foundsum = lfs_cksum_fold(foundsum); 725 if (datasum != foundsum) { 726 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 727 " (wanted %x got %x)\n", 728 offset, datasum, foundsum)); 729 offset = -1; 730 goto err; 731 } 732 } 733 734 if (phase == CHECK_CKSUM) 735 lfs_sb_subavail(fs, offset - prevoffset); 736 else { 737 /* Don't clog the buffer queue */ 738 mutex_enter(&lfs_lock); 739 if (locked_queue_count > LFS_MAX_BUFS || 740 locked_queue_bytes > LFS_MAX_BYTES) { 741 lfs_flush(fs, SEGM_CKP, 0); 742 } 743 mutex_exit(&lfs_lock); 744 } 745 746 /* 747 * If we're at the end of the segment, move to the next. 748 * A partial segment needs space for a segment header (1 fsb) 749 * and a full block ("frag" fsb). Thus, adding "frag" fsb should 750 * still be within the current segment (whereas frag + 1 might 751 * be at the start of the next segment). 752 * 753 * This needs to match the definition of LFS_PARTIAL_FITS 754 * in lfs_segment.c. 755 */ 756 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs)) 757 != lfs_dtosn(fs, offset)) { 758 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, 759 ssp))) { 760 printf("WHOA! 
at 0x%jx/seg %jd moving to 0x%jx/seg %jd\n", 761 (intmax_t)offset, 762 (intmax_t)lfs_dtosn(fs, offset), 763 (intmax_t)lfs_ss_getnext(fs, ssp), 764 (intmax_t)lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))); 765 offset = -1; 766 goto err; 767 } 768 offset = lfs_ss_getnext(fs, ssp); 769 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 770 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 771 } 772 773 err: 774 free(buf, M_SEGMENT); 775 776 return offset; 777 } 778 779 void 780 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 781 { 782 int flags, dirty, phase; 783 daddr_t startoffset, offset, nextoffset, endpseg; 784 u_int64_t nextserial, startserial, endserial; 785 int sn, curseg; 786 struct proc *p; 787 kauth_cred_t cred; 788 SEGUSE *sup; 789 struct buf *bp; 790 791 p = l ? l->l_proc : NULL; 792 cred = p ? p->p_cred : NOCRED; 793 794 /* 795 * Roll forward. 796 * 797 * We don't roll forward for v1 filesystems, because 798 * of the danger that the clock was turned back between the last 799 * checkpoint and crash. This would roll forward garbage. 800 * 801 * v2 filesystems don't have this problem because they use a 802 * monotonically increasing serial number instead of a timestamp. 803 */ 804 rblkcnt = 0; 805 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw 806 || lfs_sb_getversion(fs) <= 1 || p == NULL) 807 return; 808 809 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n", 810 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs))); 811 DEBUG_CHECK_FREELIST(fs); 812 813 /* 814 * Phase I: Find the address of the last good partial 815 * segment that was written after the checkpoint. Mark 816 * the segments in question dirty, so they won't be 817 * reallocated. 
818 */ 819 endpseg = startoffset = offset = lfs_sb_getoffset(fs); 820 flags = 0x0; 821 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 822 PRIx64 "\n", offset)); 823 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 824 if (!(sup->su_flags & SEGUSE_DIRTY)) 825 lfs_sb_subnclean(fs, 1); 826 sup->su_flags |= SEGUSE_DIRTY; 827 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 828 829 startserial = lfs_sb_getserial(fs); 830 endserial = nextserial = startserial + 1; 831 while ((nextoffset = check_segsum(fs, offset, nextserial, 832 cred, CHECK_CKSUM, &flags, l)) > 0) { 833 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) { 834 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), 835 bp); 836 if (!(sup->su_flags & SEGUSE_DIRTY)) 837 lfs_sb_subnclean(fs, 1); 838 sup->su_flags |= SEGUSE_DIRTY; 839 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 840 } 841 842 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx" 843 " serial=0x%jx\n", (intmax_t)nextoffset, 844 (intmax_t)nextserial)); 845 if (flags & SS_DIROP) { 846 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 847 PRIx64 "\n", offset)); 848 if (!(flags & SS_CONT)) { 849 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 850 "at 0x%" PRIx64 "\n", offset)); 851 } 852 } 853 offset = nextoffset; 854 ++nextserial; 855 856 if (!(flags & SS_CONT)) { 857 endpseg = nextoffset; 858 endserial = nextserial; 859 } 860 if (lfs_rfw_max_psegs > 0 861 && nextserial > startserial + lfs_rfw_max_psegs) 862 break; 863 } 864 if (flags & SS_CONT) { 865 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 866 "dirops discarded (0x%jx < 0x%jx)\n", 867 endpseg, nextoffset)); 868 } 869 if (lfs_sb_getversion(fs) > 1) 870 lfs_sb_setserial(fs, endserial); 871 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 872 "endpseg=0x%" PRIx64 "\n", endpseg)); 873 offset = startoffset; 874 if (offset != endpseg) { 875 /* Don't overwrite what we're trying to preserve */ 876 lfs_sb_setoffset(fs, endpseg); 877 lfs_sb_setcurseg(fs, lfs_sntod(fs, 
lfs_dtosn(fs, endpseg))); 878 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 879 sn = (sn + 1) % lfs_sb_getnseg(fs); 880 /* XXX could we just fail to roll forward? */ 881 if (sn == curseg) 882 panic("lfs_mountfs: no clean segments"); 883 LFS_SEGENTRY(sup, fs, sn, bp); 884 dirty = (sup->su_flags & SEGUSE_DIRTY); 885 brelse(bp, 0); 886 if (!dirty) 887 break; 888 } 889 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 890 /* Explicitly set this segment dirty */ 891 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 892 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 893 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 894 895 896 /* 897 * Phase II: Identify the highest generation of each 898 * inode. 899 * 900 * Phase III: Update inodes. We end up with the 901 * last version of each inode present, and can ignore 902 * data blocks belonging to previous versions. 903 * 904 * Phase IV: Roll forward, updating data blocks. 905 */ 906 for (phase = CHECK_GEN; phase <= CHECK_DATA; ++phase) { 907 offset = startoffset; 908 nextserial = startserial + 1; 909 printf("LFS roll forward phase %d beginning\n", phase); 910 while (offset > 0 && offset != endpseg) { 911 if (phase == CHECK_DATA) { 912 DLOG((DLOG_RF, "LFS roll forward" 913 " phase %d: offset=0x%jx" 914 " serial=0x%jx\n", 915 phase, (intmax_t)offset, 916 (intmax_t)nextserial)); 917 } 918 offset = check_segsum(fs, offset, 919 nextserial, cred, 920 phase, NULL, l); 921 ++nextserial; 922 DEBUG_CHECK_FREELIST(fs); 923 } 924 } 925 926 /* 927 * Finish: flush our changes to disk. 
928 */ 929 lfs_sb_setserial(fs, endserial); 930 931 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 932 DLOG((DLOG_RF, "lfs_mountfs: roll forward " 933 "examined %jd blocks\n", 934 (intmax_t)(endpseg - startoffset))); 935 } 936 937 /* Get rid of our vnodes, except the ifile */ 938 drop_vnode_pages(mp, l); 939 DLOG((DLOG_RF, "LFS roll forward complete\n")); 940 printf("%s: roll forward recovered %d data blocks\n", 941 lfs_sb_getfsmnt(fs), rblkcnt); 942 943 /* 944 * At this point we have no more changes to write to disk. 945 * Reset the "avail" count to match the segments as they 946 * appear on disk, and the clean segment count. 947 */ 948 lfs_reset_avail(fs); 949 } 950 951 static bool 952 all_selector(void *cl, struct vnode *vp) 953 { 954 return true; 955 } 956 957 958 /* 959 * Dump any pages from vnodes that may have been put on 960 * during truncation. 961 */ 962 static void 963 drop_vnode_pages(struct mount *mp, struct lwp *l) 964 { 965 struct vnode_iterator *marker; 966 struct lfs *fs; 967 struct vnode *vp; 968 969 fs = VFSTOULFS(mp)->um_lfs; 970 vfs_vnode_iterator_init(mp, &marker); 971 while ((vp = vfs_vnode_iterator_next(marker, 972 all_selector, NULL)) != NULL) { 973 if (vp == fs->lfs_ivnode) 974 continue; 975 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); 976 uvm_vnp_setsize(vp, 0); 977 uvm_vnp_setsize(vp, VTOI(vp)->i_size); 978 VOP_UNLOCK(vp); 979 vrele(vp); 980 } 981 vfs_vnode_iterator_destroy(marker); 982 } 983 984