1 /* $NetBSD: lfs_rfw.c,v 1.39 2025/10/14 00:13:31 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.39 2025/10/14 00:13:31 perseant Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/file.h> 50 #include <sys/disklabel.h> 51 #include <sys/ioctl.h> 52 #include <sys/errno.h> 53 #include <sys/malloc.h> 54 #include <sys/pool.h> 55 #include <sys/socket.h> 56 #include <sys/stat.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 #include <sys/conf.h> 60 #include <sys/kauth.h> 61 62 #include <miscfs/specfs/specdev.h> 63 64 #include <ufs/lfs/ulfs_quotacommon.h> 65 #include <ufs/lfs/ulfs_inode.h> 66 #include <ufs/lfs/ulfsmount.h> 67 #include <ufs/lfs/ulfs_extern.h> 68 69 #include <uvm/uvm_extern.h> 70 71 #include <ufs/lfs/lfs.h> 72 #include <ufs/lfs/lfs_accessors.h> 73 #include <ufs/lfs/lfs_kernel.h> 74 #include <ufs/lfs/lfs_extern.h> 75 76 #include <miscfs/genfs/genfs.h> 77 #include <miscfs/genfs/genfs_node.h> 78 79 /* 80 * Roll-forward code. 81 */ 82 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, 83 kauth_cred_t, int, int *, struct lwp *); 84 85 static bool all_selector(void *, struct vnode *); 86 static void drop_vnode_pages(struct mount *, struct lwp *); 87 static int update_inogen(struct lfs *, daddr_t); 88 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, const union lfs_dinode *); 89 90 extern int lfs_do_rfw; 91 int rblkcnt; 92 int lfs_rfw_max_psegs = 0; 93 94 /* 95 * Allocate a particular inode with a particular version number, freeing 96 * any previous versions of this inode that may have gone before. 97 * Used by the roll-forward code. 98 * 99 * XXX this function does not have appropriate locking to be used on a live fs; 100 * XXX but something similar could probably be used for an "undelete" call. 101 * 102 * Called with the Ifile inode locked. 103 */ 104 int 105 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 106 struct vnode **vpp, union lfs_dinode *dip) 107 { 108 struct vattr va; 109 struct vnode *vp; 110 struct inode *ip; 111 int error; 112 113 KASSERT(ino > LFS_IFILE_INUM); 114 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 115 116 /* 117 * First, just try a vget. If the version number is the one we want, 118 * we don't have to do anything else. If the version number is wrong, 119 * take appropriate action. 120 */ 121 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 122 if (error == 0) { 123 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", 124 (int)ino, vp)); 125 126 *vpp = vp; 127 ip = VTOI(vp); 128 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking" 129 " version %jd\n", (intmax_t)ip->i_gen, 130 (intmax_t)(dip == NULL ? -1 131 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers)); 132 if (ip->i_gen == vers) { 133 /* 134 * We have what we wanted already. 135 */ 136 DLOG((DLOG_RF, " pre-existing\n")); 137 return 0; 138 } else if (ip->i_gen < vers && dip != NULL 139 && lfs_dino_getnlink(fs, dip) > 0) { 140 /* 141 * We have found a newer version. Truncate 142 * the old vnode to zero and re-initialize 143 * from the given dinode. 144 */ 145 DLOG((DLOG_RF, " replace old version %jd\n", 146 (intmax_t)ip->i_gen)); 147 lfs_truncate(vp, (off_t)0, 0, NOCRED); 148 ip->i_gen = vers; 149 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip)); 150 update_inoblk_copy_dinode(fs, ip->i_din, dip); 151 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 152 return 0; 153 } else { 154 /* 155 * Not the right version and nothing to 156 * initialize from. Don't recover this data. 157 */ 158 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 159 (int)ino, (int)vers, 160 (int)lfs_dino_getgen(fs, ip->i_din))); 161 vput(vp); 162 *vpp = NULLVP; 163 return EEXIST; 164 } 165 } 166 167 /* 168 * No version of this inode was found in the cache. 169 * Make a new one from the dinode. We will add data blocks 170 * as they come in, so scrub any block addresses off of the 171 * inode and reset block counts to zero. 172 */ 173 if (dip == NULL) 174 return ENOENT; 175 176 vattr_null(&va); 177 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip)); 178 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS; 179 va.va_fileid = ino; 180 va.va_gen = vers; 181 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL, 182 &vp); 183 if (error) 184 return error; 185 error = vn_lock(vp, LK_EXCLUSIVE); 186 if (error) 187 goto err; 188 189 ip = VTOI(vp); 190 update_inoblk_copy_dinode(fs, ip->i_din, dip); 191 192 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d," 193 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size, 194 (int)ip->i_lfs_effnblks, 195 (int)lfs_dino_getblocks(fs, ip->i_din))); 196 *vpp = vp; 197 return 0; 198 199 err: 200 vrele(vp); 201 *vpp = NULLVP; 202 return error; 203 } 204 205 /* 206 * Load the appropriate indirect block, and change the appropriate pointer. 207 * Mark the block dirty. Do segment and avail accounting. 208 */ 209 static int 210 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 211 daddr_t ndaddr, size_t size, struct lwp *l) 212 { 213 int error; 214 struct vnode *vp; 215 struct inode *ip; 216 daddr_t odaddr; 217 struct indir a[ULFS_NIADDR]; 218 int num; 219 struct buf *bp; 220 SEGUSE *sup; 221 u_int64_t newsize, loff; 222 223 KASSERT(lbn >= 0); /* no indirect blocks */ 224 KASSERT(ino > LFS_IFILE_INUM); 225 226 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n", 227 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr)); 228 229 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0) 230 return error; 231 ip = VTOI(vp); 232 233 /* 234 * If block already exists, note its new location 235 * but do not account it as new. 236 */ 237 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 238 if (odaddr == UNASSIGNED) { 239 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), 240 size, NOCRED, 0, &bp)) != 0) { 241 vput(vp); 242 return (error); 243 } 244 /* No need to write, the block is already on disk */ 245 if (bp->b_oflags & BO_DELWRI) { 246 LFS_UNLOCK_BUF(bp); 247 /* Account recovery of the previous version */ 248 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 249 } 250 brelse(bp, BC_INVAL); 251 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d," 252 " lfs_dino_getblocks(fs, ip->i_din) = %d\n", 253 (int)ip->i_lfs_effnblks, 254 (int)lfs_dino_getblocks(fs, ip->i_din))); 255 } else { 256 /* XXX fragextend? */ 257 DLOG((DLOG_RF, "block exists, no balloc\n")); 258 } 259 260 /* 261 * Extend the file, if it is not large enough already. 262 * XXX this is not exactly right, we don't know how much of the 263 * XXX last block is actually used. 264 */ 265 loff = lfs_lblktosize(fs, lbn); 266 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) { 267 /* No fragments */ 268 newsize = loff + 1; 269 } else { 270 /* Subtract only a fragment to account for block size */ 271 newsize = loff + size - lfs_fsbtob(fs, 1) + 1; 272 } 273 274 if (ip->i_size < newsize) { 275 DLOG((DLOG_RF, "ino %d size %d -> %d\n", 276 (int)ino, (int)ip->i_size, (int)newsize)); 277 lfs_dino_setsize(fs, ip->i_din, newsize); 278 ip->i_size = newsize; 279 /* 280 * tell vm our new size for the case the inode won't 281 * appear later. 282 */ 283 uvm_vnp_setsize(vp, newsize); 284 } 285 286 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 287 288 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 289 sup->su_nbytes += size; 290 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 291 292 /* differences here should be due to UNWRITTEN indirect blocks. */ 293 if (vp->v_type != VLNK) { 294 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)) 295 #if 0 296 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 297 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) 298 #endif /* 0 */ 299 ) { 300 vprint("vnode", vp); 301 printf("effnblks=%jd dino_getblocks=%jd\n", 302 (intmax_t)ip->i_lfs_effnblks, 303 (intmax_t)lfs_dino_getblocks(fs, ip->i_din)); 304 } 305 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 306 #if 0 307 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 308 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)); 309 #endif /* 0 */ 310 } 311 312 #ifdef DEBUG 313 /* Now look again to make sure it worked */ 314 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 315 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 316 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd" 317 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr)); 318 #endif /* DEBUG */ 319 vput(vp); 320 return 0; 321 } 322 323 /* 324 * Copy some the fields of the dinode as needed by update_inoblk(). 325 */ 326 static void 327 update_inoblk_copy_dinode(struct lfs *fs, 328 union lfs_dinode *dstu, const union lfs_dinode *srcu) 329 { 330 if (fs->lfs_is64) { 331 struct lfs64_dinode *dst = &dstu->u_64; 332 const struct lfs64_dinode *src = &srcu->u_64; 333 unsigned i; 334 335 /* 336 * Copy everything but the block pointers and di_blocks. 337 * XXX what about di_extb? 338 */ 339 dst->di_mode = src->di_mode; 340 dst->di_nlink = src->di_nlink; 341 dst->di_uid = src->di_uid; 342 dst->di_gid = src->di_gid; 343 dst->di_blksize = src->di_blksize; 344 dst->di_size = src->di_size; 345 dst->di_atime = src->di_atime; 346 dst->di_mtime = src->di_mtime; 347 dst->di_ctime = src->di_ctime; 348 dst->di_birthtime = src->di_birthtime; 349 dst->di_mtimensec = src->di_mtimensec; 350 dst->di_atimensec = src->di_atimensec; 351 dst->di_ctimensec = src->di_ctimensec; 352 dst->di_birthnsec = src->di_birthnsec; 353 dst->di_gen = src->di_gen; 354 dst->di_kernflags = src->di_kernflags; 355 dst->di_flags = src->di_flags; 356 dst->di_extsize = src->di_extsize; 357 dst->di_modrev = src->di_modrev; 358 dst->di_inumber = src->di_inumber; 359 for (i = 0; i < __arraycount(src->di_spare); i++) { 360 dst->di_spare[i] = src->di_spare[i]; 361 } 362 /* Short symlinks store their data in di_db. */ 363 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 364 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 365 memcpy(dst->di_db, src->di_db, src->di_size); 366 } 367 } else { 368 struct lfs32_dinode *dst = &dstu->u_32; 369 const struct lfs32_dinode *src = &srcu->u_32; 370 371 /* Get mode, link count, size, and times */ 372 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 373 374 /* Then the rest, except di_blocks */ 375 dst->di_flags = src->di_flags; 376 dst->di_gen = src->di_gen; 377 dst->di_uid = src->di_uid; 378 dst->di_gid = src->di_gid; 379 dst->di_modrev = src->di_modrev; 380 381 /* Short symlinks store their data in di_db. */ 382 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 383 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 384 memcpy(dst->di_db, src->di_db, src->di_size); 385 } 386 } 387 } 388 389 static int 390 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred, 391 struct lwp *l) 392 { 393 struct vnode *devvp, *vp; 394 struct inode *ip; 395 union lfs_dinode *dip; 396 struct buf *dbp, *ibp; 397 int error; 398 daddr_t daddr; 399 IFILE *ifp; 400 SEGUSE *sup; 401 unsigned i, num; 402 uint32_t gen, osn, nsn; 403 char *buf; 404 405 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 406 407 /* 408 * Get the inode, update times and perms. 409 * DO NOT update disk blocks, we do that separately. 410 */ 411 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 412 0, &dbp); 413 if (error) { 414 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 415 return error; 416 } 417 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK); 418 memcpy(buf, dbp->b_data, dbp->b_bcount); 419 brelse(dbp, BC_AGE); 420 num = LFS_INOPB(fs); 421 for (i = num; i-- > 0; ) { 422 dip = DINO_IN_BLOCK(fs, buf, i); 423 if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM) 424 continue; 425 426 /* Check generation number */ 427 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 428 gen = lfs_if_getversion(fs, ifp); 429 brelse(ibp, 0); 430 if (lfs_dino_getgen(fs, dip) < gen) { 431 continue; 432 } 433 434 /* 435 * This inode is the newest generation. Load it. 436 */ 437 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip), 438 lfs_dino_getgen(fs, dip), 439 l, &vp, dip); 440 if (error) { 441 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 442 " returned %d\n", error)); 443 continue; 444 } 445 ip = VTOI(vp); 446 if (lfs_dino_getsize(fs, dip) != ip->i_size 447 && vp->v_type != VLNK) { 448 /* XXX What should we do sith symlinks? */ 449 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n", 450 (intmax_t)lfs_dino_getinumber(fs, dip), 451 (intmax_t)ip->i_size, 452 (intmax_t)lfs_dino_getsize(fs, dip))); 453 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 454 NOCRED); 455 } 456 update_inoblk_copy_dinode(fs, ip->i_din, dip); 457 458 ip->i_flags = lfs_dino_getflags(fs, dip); 459 ip->i_gen = lfs_dino_getgen(fs, dip); 460 ip->i_uid = lfs_dino_getuid(fs, dip); 461 ip->i_gid = lfs_dino_getgid(fs, dip); 462 463 ip->i_mode = lfs_dino_getmode(fs, dip); 464 ip->i_nlink = lfs_dino_getnlink(fs, dip); 465 ip->i_size = lfs_dino_getsize(fs, dip); 466 467 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 468 469 /* Re-initialize to get type right */ 470 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 471 &vp); 472 473 /* Record change in location */ 474 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 475 daddr = lfs_if_getdaddr(fs, ifp); 476 lfs_if_setdaddr(fs, ifp, offset); 477 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 478 /* And do segment accounting */ 479 osn = lfs_dtosn(fs, daddr); 480 nsn = lfs_dtosn(fs, offset); 481 if (DADDR_IS_BAD(daddr) || osn != nsn) { 482 if (!DADDR_IS_BAD(daddr)) { 483 LFS_SEGENTRY(sup, fs, osn, ibp); 484 sup->su_nbytes -= DINOSIZE(fs); 485 LFS_WRITESEGENTRY(sup, fs, osn, ibp); 486 } 487 LFS_SEGENTRY(sup, fs, nsn, ibp); 488 sup->su_nbytes += DINOSIZE(fs); 489 LFS_WRITESEGENTRY(sup, fs, nsn, ibp); 490 } 491 vput(vp); 492 } 493 free(buf, M_SEGMENT); 494 495 return 0; 496 } 497 498 /* 499 * Note the highest generation number of each inode in the Ifile. 500 * This allows us to skip processing data for intermediate versions. 501 */ 502 static int 503 update_inogen(struct lfs *fs, daddr_t offset) 504 { 505 struct vnode *devvp; 506 union lfs_dinode *dip; 507 struct buf *dbp, *ibp; 508 int error; 509 IFILE *ifp; 510 unsigned i, num; 511 512 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 513 514 /* Read inode block */ 515 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 516 0, &dbp); 517 if (error) { 518 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 519 return error; 520 } 521 522 /* Check each inode against ifile entry */ 523 num = LFS_INOPB(fs); 524 for (i = num; i-- > 0; ) { 525 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 526 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM) 527 continue; 528 529 /* Update generation number */ 530 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 531 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip)) 532 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip)); 533 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 534 if (error) 535 break; 536 } 537 brelse(dbp, BC_AGE); 538 539 return error; 540 } 541 542 #define CHECK_CKSUM 1 /* Check the checksum to make sure it's valid */ 543 #define CHECK_GEN 2 /* Update highest generation number */ 544 #define CHECK_INODES 3 /* Read and process inodes */ 545 #define CHECK_DATA 4 /* Identify and process data blocks */ 546 547 static daddr_t 548 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, 549 kauth_cred_t cred, int phase, int *pseg_flags, struct lwp *l) 550 { 551 struct vnode *devvp; 552 struct buf *bp, *dbp; 553 int error, ninos, i, j; 554 SEGSUM *ssp; 555 daddr_t prevoffset; 556 IINFO *iip; 557 FINFO *fip; 558 SEGUSE *sup; 559 size_t size; 560 uint32_t datasum, foundsum; 561 char *buf; 562 563 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 564 565 /* 566 * If this is segment 0, skip the label. 567 * If the segment has a superblock and we're at the top 568 * of the segment, skip the superblock. 569 */ 570 if (offset == lfs_sb_gets0addr(fs)) 571 offset += lfs_btofsb(fs, LFS_LABELPAD); 572 if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) { 573 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 574 if (sup->su_flags & SEGUSE_SUPERBLOCK) 575 offset += lfs_btofsb(fs, LFS_SBPAD); 576 brelse(bp, 0); 577 } 578 579 /* Read in the segment summary */ 580 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 581 0, &bp); 582 if (error) 583 return -1; 584 buf = malloc(bp->b_bcount, M_SEGMENT, M_WAITOK); 585 memcpy(buf, bp->b_data, bp->b_bcount); 586 brelse(bp, BC_AGE); 587 588 ssp = (SEGSUM *)buf; 589 590 /* 591 * Phase I: Check summary checksum. 592 */ 593 if (phase == CHECK_CKSUM) { 594 size_t sumstart; 595 596 sumstart = lfs_ss_getsumstart(fs); 597 if (lfs_ss_getsumsum(fs, ssp) != 598 cksum((char *)ssp + sumstart, 599 lfs_sb_getsumsize(fs) - sumstart)) { 600 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", 601 offset)); 602 offset = -1; 603 goto err; 604 } 605 #if 0 606 /* 607 * Under normal conditions, we should never be producing 608 * a partial segment with neither inode blocks nor data blocks. 609 * However, these do sometimes appear and they need not 610 * prevent us from continuing. 611 */ 612 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 613 lfs_ss_getninos(fs, ssp) == 0) { 614 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", 615 offset)); 616 offset = -1; 617 goto err; 618 } 619 #endif /* 0 */ 620 if (lfs_sb_getversion(fs) == 1) { 621 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 622 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 623 offset = -1; 624 goto err; 625 } 626 } else { 627 if (lfs_ss_getserial(fs, ssp) != nextserial) { 628 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx," 629 " expected 0x%jx\n", (intmax_t)offset, 630 (intmax_t)lfs_ss_getserial(fs, ssp), 631 (intmax_t)nextserial)); 632 offset = -1; 633 goto err; 634 } 635 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 636 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 637 PRIx64 "\n", lfs_ss_getident(fs, ssp), 638 lfs_sb_getident(fs), offset)); 639 offset = -1; 640 goto err; 641 } 642 } 643 } 644 if (pseg_flags) 645 *pseg_flags = lfs_ss_getflags(fs, ssp); 646 prevoffset = offset; 647 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 648 649 /* Handle individual blocks */ 650 foundsum = 0; 651 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 652 iip = SEGSUM_IINFOSTART(fs, buf); 653 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf); 654 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 655 /* Inode block? */ 656 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 657 if (phase == CHECK_CKSUM) { 658 /* Read in the head and add to the buffer */ 659 error = bread(devvp, LFS_FSBTODB(fs, offset), 660 lfs_sb_getbsize(fs), 0, &dbp); 661 if (error) { 662 offset = -1; 663 goto err; 664 } 665 foundsum = lfs_cksum_part(dbp->b_data, 666 sizeof(uint32_t), foundsum); 667 brelse(dbp, BC_AGE); 668 } 669 if (phase == CHECK_GEN) { 670 if ((error = update_inogen(fs, offset)) 671 != 0) { 672 offset = -1; 673 goto err; 674 } 675 } 676 if (phase == CHECK_INODES) { 677 if ((error = update_inoblk(fs, offset, cred, l)) 678 != 0) { 679 offset = -1; 680 goto err; 681 } 682 } 683 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 684 iip = NEXTLOWER_IINFO(fs, iip); 685 --ninos; 686 --i; /* compensate for ++i in loop header */ 687 continue; 688 } 689 690 /* File block */ 691 size = lfs_sb_getbsize(fs); 692 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 693 if (j == lfs_fi_getnblocks(fs, fip) - 1) 694 size = lfs_fi_getlastlength(fs, fip); 695 if (phase == CHECK_CKSUM) { 696 error = bread(devvp, LFS_FSBTODB(fs, offset), 697 size, 0, &dbp); 698 if (error) { 699 offset = -1; 700 goto err; 701 } 702 foundsum = lfs_cksum_part(dbp->b_data, 703 sizeof(uint32_t), foundsum); 704 brelse(dbp, BC_AGE); 705 } 706 /* Account for and update any direct blocks */ 707 if (phase == CHECK_DATA && 708 lfs_fi_getino(fs, fip) > LFS_IFILE_INUM && 709 lfs_fi_getblock(fs, fip, j) >= 0) { 710 update_meta(fs, lfs_fi_getino(fs, fip), 711 lfs_fi_getversion(fs, fip), 712 lfs_fi_getblock(fs, fip, j), 713 offset, size, l); 714 ++rblkcnt; 715 } 716 offset += lfs_btofsb(fs, size); 717 } 718 719 fip = NEXT_FINFO(fs, fip); 720 } 721 722 /* Checksum the array, compare */ 723 if (phase == CHECK_CKSUM) { 724 datasum = lfs_ss_getdatasum(fs, ssp); 725 foundsum = lfs_cksum_fold(foundsum); 726 if (datasum != foundsum) { 727 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 728 " (wanted %x got %x)\n", 729 offset, datasum, foundsum)); 730 offset = -1; 731 goto err; 732 } 733 } 734 735 if (phase == CHECK_CKSUM) 736 lfs_sb_subavail(fs, offset - prevoffset); 737 else { 738 /* Don't clog the buffer queue */ 739 mutex_enter(&lfs_lock); 740 if (locked_queue_count > LFS_MAX_BUFS || 741 locked_queue_bytes > LFS_MAX_BYTES) { 742 lfs_flush(fs, SEGM_CKP, 0); 743 } 744 mutex_exit(&lfs_lock); 745 } 746 747 /* 748 * If we're at the end of the segment, move to the next. 749 * A partial segment needs space for a segment header (1 fsb) 750 * and a full block ("frag" fsb). Thus, adding "frag" fsb should 751 * still be within the current segment (whereas frag + 1 might 752 * be at the start of the next segment). 753 * 754 * This needs to match the definition of LFS_PARTIAL_FITS 755 * in lfs_segment.c. 756 */ 757 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs)) 758 != lfs_dtosn(fs, offset)) { 759 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, 760 ssp))) { 761 printf("WHOA! at 0x%jx/seg %jd moving to 0x%jx/seg %jd\n", 762 (intmax_t)offset, 763 (intmax_t)lfs_dtosn(fs, offset), 764 (intmax_t)lfs_ss_getnext(fs, ssp), 765 (intmax_t)lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))); 766 offset = -1; 767 goto err; 768 } 769 offset = lfs_ss_getnext(fs, ssp); 770 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 771 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 772 } 773 774 err: 775 free(buf, M_SEGMENT); 776 777 return offset; 778 } 779 780 void 781 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 782 { 783 int flags, dirty, phase; 784 daddr_t startoffset, offset, nextoffset, endpseg; 785 u_int64_t nextserial, startserial, endserial; 786 int sn, curseg; 787 struct proc *p; 788 kauth_cred_t cred; 789 SEGUSE *sup; 790 struct buf *bp; 791 792 p = l ? l->l_proc : NULL; 793 cred = p ? p->p_cred : NOCRED; 794 795 /* 796 * Roll forward. 797 * 798 * We don't roll forward for v1 filesystems, because 799 * of the danger that the clock was turned back between the last 800 * checkpoint and crash. This would roll forward garbage. 801 * 802 * v2 filesystems don't have this problem because they use a 803 * monotonically increasing serial number instead of a timestamp. 804 */ 805 rblkcnt = 0; 806 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw 807 || lfs_sb_getversion(fs) <= 1 || p == NULL) 808 return; 809 810 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n", 811 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs))); 812 DEBUG_CHECK_FREELIST(fs); 813 814 /* 815 * Phase I: Find the address of the last good partial 816 * segment that was written after the checkpoint. Mark 817 * the segments in question dirty, so they won't be 818 * reallocated. 819 */ 820 endpseg = startoffset = offset = lfs_sb_getoffset(fs); 821 flags = 0x0; 822 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 823 PRIx64 "\n", offset)); 824 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 825 if (!(sup->su_flags & SEGUSE_DIRTY)) 826 lfs_sb_subnclean(fs, 1); 827 sup->su_flags |= SEGUSE_DIRTY; 828 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 829 830 startserial = lfs_sb_getserial(fs); 831 endserial = nextserial = startserial + 1; 832 while ((nextoffset = check_segsum(fs, offset, nextserial, 833 cred, CHECK_CKSUM, &flags, l)) > 0) { 834 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) { 835 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), 836 bp); 837 if (!(sup->su_flags & SEGUSE_DIRTY)) 838 lfs_sb_subnclean(fs, 1); 839 sup->su_flags |= SEGUSE_DIRTY; 840 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 841 } 842 843 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx" 844 " serial=0x%jx\n", (intmax_t)nextoffset, 845 (intmax_t)nextserial)); 846 if (flags & SS_DIROP) { 847 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 848 PRIx64 "\n", offset)); 849 if (!(flags & SS_CONT)) { 850 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 851 "at 0x%" PRIx64 "\n", offset)); 852 } 853 } 854 offset = nextoffset; 855 ++nextserial; 856 857 if (!(flags & SS_CONT)) { 858 endpseg = nextoffset; 859 endserial = nextserial; 860 } 861 if (lfs_rfw_max_psegs > 0 862 && nextserial > startserial + lfs_rfw_max_psegs) 863 break; 864 } 865 if (flags & SS_CONT) { 866 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 867 "dirops discarded (0x%jx < 0x%jx)\n", 868 endpseg, nextoffset)); 869 } 870 if (lfs_sb_getversion(fs) > 1) 871 lfs_sb_setserial(fs, endserial); 872 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 873 "endpseg=0x%" PRIx64 "\n", endpseg)); 874 offset = startoffset; 875 if (offset != endpseg) { 876 /* Don't overwrite what we're trying to preserve */ 877 lfs_sb_setoffset(fs, endpseg); 878 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg))); 879 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 880 sn = (sn + 1) % lfs_sb_getnseg(fs); 881 /* XXX could we just fail to roll forward? */ 882 if (sn == curseg) 883 panic("lfs_mountfs: no clean segments"); 884 LFS_SEGENTRY(sup, fs, sn, bp); 885 dirty = (sup->su_flags & SEGUSE_DIRTY); 886 brelse(bp, 0); 887 if (!dirty) 888 break; 889 } 890 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 891 /* Explicitly set this segment dirty */ 892 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 893 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 894 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 895 896 897 /* 898 * Phase II: Identify the highest generation of each 899 * inode. 900 * 901 * Phase III: Update inodes. We end up with the 902 * last version of each inode present, and can ignore 903 * data blocks belonging to previous versions. 904 * 905 * Phase IV: Roll forward, updating data blocks. 906 */ 907 for (phase = CHECK_GEN; phase <= CHECK_DATA; ++phase) { 908 offset = startoffset; 909 nextserial = startserial + 1; 910 printf("LFS roll forward phase %d beginning\n", phase); 911 while (offset > 0 && offset != endpseg) { 912 if (phase == CHECK_DATA) { 913 DLOG((DLOG_RF, "LFS roll forward" 914 " phase %d: offset=0x%jx" 915 " serial=0x%jx\n", 916 phase, (intmax_t)offset, 917 (intmax_t)nextserial)); 918 } 919 offset = check_segsum(fs, offset, 920 nextserial, cred, 921 phase, NULL, l); 922 ++nextserial; 923 DEBUG_CHECK_FREELIST(fs); 924 } 925 } 926 927 /* 928 * Finish: flush our changes to disk. 929 */ 930 lfs_sb_setserial(fs, endserial); 931 932 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 933 DLOG((DLOG_RF, "lfs_mountfs: roll forward " 934 "examined %jd blocks\n", 935 (intmax_t)(endpseg - startoffset))); 936 } 937 938 /* Get rid of our vnodes, except the ifile */ 939 drop_vnode_pages(mp, l); 940 DLOG((DLOG_RF, "LFS roll forward complete\n")); 941 printf("%s: roll forward recovered %d data blocks\n", 942 lfs_sb_getfsmnt(fs), rblkcnt); 943 944 /* 945 * At this point we have no more changes to write to disk. 946 * Reset the "avail" count to match the segments as they 947 * appear on disk, and the clean segment count. 948 */ 949 lfs_reset_avail(fs); 950 } 951 952 static bool 953 all_selector(void *cl, struct vnode *vp) 954 { 955 return true; 956 } 957 958 959 /* 960 * Dump any pages from vnodes that may have been put on 961 * during truncation. 962 */ 963 static void 964 drop_vnode_pages(struct mount *mp, struct lwp *l) 965 { 966 struct vnode_iterator *marker; 967 struct lfs *fs; 968 struct vnode *vp; 969 970 fs = VFSTOULFS(mp)->um_lfs; 971 vfs_vnode_iterator_init(mp, &marker); 972 while ((vp = vfs_vnode_iterator_next(marker, 973 all_selector, NULL)) != NULL) { 974 if (vp == fs->lfs_ivnode) 975 continue; 976 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); 977 uvm_vnp_setsize(vp, 0); 978 uvm_vnp_setsize(vp, VTOI(vp)->i_size); 979 VOP_UNLOCK(vp); 980 vrele(vp); 981 } 982 vfs_vnode_iterator_destroy(marker); 983 } 984 985