1 /* $NetBSD: lfs_rfw.c,v 1.43 2025/12/10 03:20:59 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2025 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.43 2025/12/10 03:20:59 perseant Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/file.h> 50 #include <sys/disklabel.h> 51 #include <sys/ioctl.h> 52 #include <sys/errno.h> 53 #include <sys/malloc.h> 54 #include <sys/pool.h> 55 #include <sys/socket.h> 56 #include <sys/stat.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 #include <sys/conf.h> 60 #include <sys/kauth.h> 61 62 #include <miscfs/specfs/specdev.h> 63 64 #include <ufs/lfs/ulfs_quotacommon.h> 65 #include <ufs/lfs/ulfs_inode.h> 66 #include <ufs/lfs/ulfsmount.h> 67 #include <ufs/lfs/ulfs_extern.h> 68 69 #include <uvm/uvm_extern.h> 70 71 #include <ufs/lfs/lfs.h> 72 #include <ufs/lfs/lfs_accessors.h> 73 #include <ufs/lfs/lfs_kernel.h> 74 #include <ufs/lfs/lfs_extern.h> 75 76 #include <miscfs/genfs/genfs.h> 77 #include <miscfs/genfs/genfs_node.h> 78 79 /* 80 * Roll-forward code. 81 */ 82 static bool all_selector(void *, struct vnode *); 83 static void drop_vnode_pages(struct mount *, struct lwp *); 84 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, 85 const union lfs_dinode *); 86 static int update_inogen(struct lfs_inofuncarg *); 87 static int update_inoblk(struct lfs_inofuncarg *); 88 static int finfo_func_rfw(struct lfs_finfofuncarg *); 89 90 static int update_meta(struct lfs *, ino_t, int, daddr_t, daddr_t, size_t, 91 struct lwp *l); 92 #if 0 93 static bool lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2); 94 #endif 95 96 extern int lfs_do_rfw; 97 int rblkcnt; 98 int lfs_rfw_max_psegs = 0; 99 100 /* 101 * Allocate a particular inode with a particular version number, freeing 102 * any previous versions of this inode that may have gone before. 103 * Used by the roll-forward code. 104 * 105 * XXX this function does not have appropriate locking to be used on a live fs; 106 * XXX but something similar could probably be used for an "undelete" call. 107 * 108 * Called with the Ifile inode locked. 109 */ 110 int 111 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 112 struct vnode **vpp, union lfs_dinode *dip) 113 { 114 struct vattr va; 115 struct vnode *vp; 116 struct inode *ip; 117 int error; 118 119 KASSERT(ino > LFS_IFILE_INUM); 120 LFS_ASSERT_MAXINO(fs, ino); 121 122 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 123 124 /* 125 * First, just try a vget. If the version number is the one we want, 126 * we don't have to do anything else. If the version number is wrong, 127 * take appropriate action. 128 */ 129 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 130 if (error == 0) { 131 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", 132 (int)ino, vp)); 133 134 *vpp = vp; 135 ip = VTOI(vp); 136 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking" 137 " version %jd\n", (intmax_t)ip->i_gen, 138 (intmax_t)(dip == NULL ? -1 139 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers)); 140 if (ip->i_gen == vers) { 141 /* 142 * We have what we wanted already. 143 */ 144 DLOG((DLOG_RF, " pre-existing\n")); 145 return 0; 146 } else if (ip->i_gen < vers && dip != NULL 147 && lfs_dino_getnlink(fs, dip) > 0) { 148 /* 149 * We have found a newer version. Truncate 150 * the old vnode to zero and re-initialize 151 * from the given dinode. 152 */ 153 DLOG((DLOG_RF, " replace old version %jd\n", 154 (intmax_t)ip->i_gen)); 155 lfs_truncate(vp, (off_t)0, 0, NOCRED); 156 ip->i_gen = vers; 157 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip)); 158 update_inoblk_copy_dinode(fs, ip->i_din, dip); 159 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 160 return 0; 161 } else { 162 /* 163 * Not the right version and nothing to 164 * initialize from. Don't recover this data. 165 */ 166 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 167 (int)ino, (int)vers, 168 (int)lfs_dino_getgen(fs, ip->i_din))); 169 vput(vp); 170 *vpp = NULLVP; 171 return EEXIST; 172 } 173 } 174 175 /* 176 * No version of this inode was found in the cache. 177 * Make a new one from the dinode. We will add data blocks 178 * as they come in, so scrub any block addresses off of the 179 * inode and reset block counts to zero. 180 */ 181 if (dip == NULL) 182 return ENOENT; 183 184 vattr_null(&va); 185 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip)); 186 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS; 187 va.va_fileid = ino; 188 va.va_gen = vers; 189 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL, 190 &vp); 191 if (error) 192 return error; 193 error = vn_lock(vp, LK_EXCLUSIVE); 194 if (error) 195 goto err; 196 197 ip = VTOI(vp); 198 update_inoblk_copy_dinode(fs, ip->i_din, dip); 199 200 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d," 201 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size, 202 (int)ip->i_lfs_effnblks, 203 (int)lfs_dino_getblocks(fs, ip->i_din))); 204 *vpp = vp; 205 return 0; 206 207 err: 208 vrele(vp); 209 *vpp = NULLVP; 210 return error; 211 } 212 213 /* 214 * Load the appropriate indirect block, and change the appropriate pointer. 215 * Mark the block dirty. Do segment and avail accounting. 216 */ 217 static int 218 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 219 daddr_t ndaddr, size_t size, struct lwp *l) 220 { 221 int error; 222 struct vnode *vp; 223 struct inode *ip; 224 daddr_t odaddr; 225 struct indir a[ULFS_NIADDR]; 226 int num; 227 struct buf *bp; 228 SEGUSE *sup; 229 u_int64_t newsize, loff; 230 231 KASSERT(lbn >= 0); /* no indirect blocks */ 232 KASSERT(ino > LFS_IFILE_INUM); 233 LFS_ASSERT_MAXINO(fs, ino); 234 235 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n", 236 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr)); 237 238 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0) 239 return error; 240 ip = VTOI(vp); 241 242 /* 243 * If block already exists, note its new location 244 * but do not account it as new. 245 */ 246 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 247 if (odaddr == UNASSIGNED) { 248 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), 249 size, NOCRED, 0, &bp)) != 0) { 250 vput(vp); 251 return (error); 252 } 253 /* No need to write, the block is already on disk */ 254 if (bp->b_oflags & BO_DELWRI) { 255 LFS_UNLOCK_BUF(bp); 256 /* Account recovery of the previous version */ 257 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 258 } 259 brelse(bp, BC_INVAL); 260 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d," 261 " lfs_dino_getblocks(fs, ip->i_din) = %d\n", 262 (int)ip->i_lfs_effnblks, 263 (int)lfs_dino_getblocks(fs, ip->i_din))); 264 } else { 265 /* XXX fragextend? */ 266 DLOG((DLOG_RF, "block exists, no balloc\n")); 267 } 268 269 /* 270 * Extend the file, if it is not large enough already. 271 * XXX This is not exactly right, we don't know how much of the 272 * XXX last block is actually used. 273 * 274 * XXX We should be able to encode the actual data length of the 275 * XXX last block in fi_lastlength, since we can infer the 276 * XXX necessary block length from that using a variant of 277 * XXX lfs_blksize(). 278 */ 279 loff = lfs_lblktosize(fs, lbn); 280 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) { 281 /* No fragments */ 282 newsize = loff + 1; 283 } else { 284 /* Subtract only a fragment to account for block size */ 285 newsize = loff + size - lfs_fsbtob(fs, 1) + 1; 286 } 287 288 if (ip->i_size < newsize) { 289 DLOG((DLOG_RF, "ino %d size %d -> %d\n", 290 (int)ino, (int)ip->i_size, (int)newsize)); 291 lfs_dino_setsize(fs, ip->i_din, newsize); 292 ip->i_size = newsize; 293 /* 294 * tell vm our new size for the case the inode won't 295 * appear later. 296 */ 297 uvm_vnp_setsize(vp, newsize); 298 } 299 300 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 301 302 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 303 DLOG((DLOG_SU, "seg %jd += %jd for ino %jd" 304 " lbn %jd db 0x%jd (rfw)\n", 305 (intmax_t)lfs_dtosn(fs, ndaddr), 306 (intmax_t)size, 307 (intmax_t)ip->i_number, 308 (intmax_t)lbn, 309 (intmax_t)ndaddr)); 310 sup->su_nbytes += size; 311 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 312 313 /* differences here should be due to UNWRITTEN indirect blocks. */ 314 if (vp->v_type != VLNK) { 315 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)) 316 #if 0 317 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 318 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) 319 #endif /* 0 */ 320 ) { 321 vprint("vnode", vp); 322 printf("effnblks=%jd dino_getblocks=%jd\n", 323 (intmax_t)ip->i_lfs_effnblks, 324 (intmax_t)lfs_dino_getblocks(fs, ip->i_din)); 325 } 326 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 327 #if 0 328 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 329 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)); 330 #endif /* 0 */ 331 } 332 333 #ifdef DEBUG 334 /* Now look again to make sure it worked */ 335 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 336 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 337 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd" 338 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr)); 339 #endif /* DEBUG */ 340 vput(vp); 341 return 0; 342 } 343 344 /* 345 * Copy some the fields of the dinode as needed by update_inoblk(). 346 */ 347 static void 348 update_inoblk_copy_dinode(struct lfs *fs, 349 union lfs_dinode *dstu, const union lfs_dinode *srcu) 350 { 351 if (fs->lfs_is64) { 352 struct lfs64_dinode *dst = &dstu->u_64; 353 const struct lfs64_dinode *src = &srcu->u_64; 354 unsigned i; 355 356 /* 357 * Copy everything but the block pointers and di_blocks. 358 * XXX what about di_extb? 359 */ 360 dst->di_mode = src->di_mode; 361 dst->di_nlink = src->di_nlink; 362 dst->di_uid = src->di_uid; 363 dst->di_gid = src->di_gid; 364 dst->di_blksize = src->di_blksize; 365 dst->di_size = src->di_size; 366 dst->di_atime = src->di_atime; 367 dst->di_mtime = src->di_mtime; 368 dst->di_ctime = src->di_ctime; 369 dst->di_birthtime = src->di_birthtime; 370 dst->di_mtimensec = src->di_mtimensec; 371 dst->di_atimensec = src->di_atimensec; 372 dst->di_ctimensec = src->di_ctimensec; 373 dst->di_birthnsec = src->di_birthnsec; 374 dst->di_gen = src->di_gen; 375 dst->di_kernflags = src->di_kernflags; 376 dst->di_flags = src->di_flags; 377 dst->di_extsize = src->di_extsize; 378 dst->di_modrev = src->di_modrev; 379 dst->di_inumber = src->di_inumber; 380 for (i = 0; i < __arraycount(src->di_spare); i++) { 381 dst->di_spare[i] = src->di_spare[i]; 382 } 383 /* Short symlinks store their data in di_db. */ 384 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 385 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 386 memcpy(dst->di_db, src->di_db, src->di_size); 387 } 388 } else { 389 struct lfs32_dinode *dst = &dstu->u_32; 390 const struct lfs32_dinode *src = &srcu->u_32; 391 392 /* Get mode, link count, size, and times */ 393 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 394 395 /* Then the rest, except di_blocks */ 396 dst->di_flags = src->di_flags; 397 dst->di_gen = src->di_gen; 398 dst->di_uid = src->di_uid; 399 dst->di_gid = src->di_gid; 400 dst->di_modrev = src->di_modrev; 401 402 /* Short symlinks store their data in di_db. */ 403 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 404 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 405 memcpy(dst->di_db, src->di_db, src->di_size); 406 } 407 } 408 } 409 410 static int 411 update_inoblk(struct lfs_inofuncarg *lifa) 412 { 413 struct lfs *fs; 414 daddr_t offset; 415 struct lwp *l; 416 struct vnode *devvp, *vp; 417 struct inode *ip; 418 union lfs_dinode *dip; 419 struct buf *dbp, *ibp; 420 int error; 421 IFILE *ifp; 422 unsigned i, num; 423 uint32_t gen; 424 char *buf; 425 ino_t ino; 426 427 fs = lifa->fs; 428 offset = lifa->offset; 429 l = lifa->l; 430 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 431 432 /* 433 * Get the inode, update times and perms. 434 * DO NOT update disk blocks, we do that separately. 435 */ 436 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 437 0, &dbp); 438 if (error) { 439 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 440 return error; 441 } 442 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK); 443 memcpy(buf, dbp->b_data, dbp->b_bcount); 444 brelse(dbp, BC_AGE); 445 num = LFS_INOPB(fs); 446 for (i = num; i-- > 0; ) { 447 dip = DINO_IN_BLOCK(fs, buf, i); 448 ino = lfs_dino_getinumber(fs, dip); 449 if (ino <= LFS_IFILE_INUM) 450 continue; 451 452 LFS_ASSERT_MAXINO(fs, ino); 453 454 /* Check generation number */ 455 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 456 gen = lfs_if_getversion(fs, ifp); 457 brelse(ibp, 0); 458 if (lfs_dino_getgen(fs, dip) < gen) { 459 continue; 460 } 461 462 /* 463 * This inode is the newest generation. Load it. 464 */ 465 error = lfs_rf_valloc(fs, ino, lfs_dino_getgen(fs, dip), 466 l, &vp, dip); 467 if (error) { 468 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 469 " returned %d\n", error)); 470 continue; 471 } 472 ip = VTOI(vp); 473 if (lfs_dino_getsize(fs, dip) != ip->i_size 474 && vp->v_type != VLNK) { 475 /* XXX What should we do with symlinks? */ 476 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n", 477 (intmax_t)ino, 478 (intmax_t)ip->i_size, 479 (intmax_t)lfs_dino_getsize(fs, dip))); 480 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 481 NOCRED); 482 } 483 update_inoblk_copy_dinode(fs, ip->i_din, dip); 484 485 ip->i_flags = lfs_dino_getflags(fs, dip); 486 ip->i_gen = lfs_dino_getgen(fs, dip); 487 ip->i_uid = lfs_dino_getuid(fs, dip); 488 ip->i_gid = lfs_dino_getgid(fs, dip); 489 490 ip->i_mode = lfs_dino_getmode(fs, dip); 491 ip->i_nlink = lfs_dino_getnlink(fs, dip); 492 ip->i_size = lfs_dino_getsize(fs, dip); 493 494 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 495 496 /* Re-initialize to get type right */ 497 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 498 &vp); 499 500 /* Record change in location and do segment accounting */ 501 lfs_update_iaddr(fs, ip, offset); 502 503 vput(vp); 504 } 505 free(buf, M_SEGMENT); 506 507 return 0; 508 } 509 510 /* 511 * Note the highest generation number of each inode in the Ifile. 512 * This allows us to skip processing data for intermediate versions. 513 */ 514 static int 515 update_inogen(struct lfs_inofuncarg *lifa) 516 { 517 struct lfs *fs; 518 daddr_t offset; 519 struct vnode *devvp; 520 union lfs_dinode *dip; 521 struct buf *dbp, *ibp; 522 int error; 523 IFILE *ifp; 524 unsigned i, num; 525 526 fs = lifa->fs; 527 offset = lifa->offset; 528 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 529 530 /* Read inode block */ 531 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 532 0, &dbp); 533 if (error) { 534 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 535 return error; 536 } 537 538 /* Check each inode against ifile entry */ 539 num = LFS_INOPB(fs); 540 for (i = num; i-- > 0; ) { 541 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 542 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM) 543 continue; 544 545 /* Update generation number */ 546 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 547 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip)) 548 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip)); 549 LFS_WRITEIENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 550 if (error) 551 break; 552 } 553 brelse(dbp, 0); 554 555 return error; 556 } 557 558 static int 559 finfo_func_rfw(struct lfs_finfofuncarg *lffa) 560 { 561 struct lfs *fs; 562 FINFO *fip; 563 daddr_t *offsetp; 564 struct lwp *l; 565 int j; 566 size_t size; 567 ino_t ino; 568 569 fs = lffa->fs; 570 fip = lffa->finfop; 571 offsetp = lffa->offsetp; 572 l = lffa->l; 573 size = lfs_sb_getbsize(fs); 574 ino = lfs_fi_getino(fs, fip); 575 LFS_ASSERT_MAXINO(fs, ino); 576 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 577 if (j == lfs_fi_getnblocks(fs, fip) - 1) 578 size = lfs_fi_getlastlength(fs, fip); 579 580 /* Account for and update any direct blocks */ 581 if (ino > LFS_IFILE_INUM && 582 lfs_fi_getblock(fs, fip, j) >= 0) { 583 update_meta(fs, ino, 584 lfs_fi_getversion(fs, fip), 585 lfs_fi_getblock(fs, fip, j), 586 *offsetp, size, l); 587 ++rblkcnt; 588 } 589 *offsetp += lfs_btofsb(fs, size); 590 } 591 592 return 0; 593 } 594 595 int 596 lfs_skip_superblock(struct lfs *fs, daddr_t *offsetp) 597 { 598 daddr_t offset; 599 int i; 600 601 /* 602 * If this is segment 0, skip the label. 603 * If the segment has a superblock and we're at the top 604 * of the segment, skip the superblock. 605 */ 606 offset = *offsetp; 607 if (offset == lfs_sb_gets0addr(fs)) { 608 offset += lfs_btofsb(fs, LFS_LABELPAD); 609 } 610 for (i = 0; i < LFS_MAXNUMSB; i++) { 611 if (offset == lfs_sb_getsboff(fs, i)) { 612 offset += lfs_btofsb(fs, LFS_SBPAD); 613 break; 614 } 615 } 616 *offsetp = offset; 617 return 0; 618 } 619 620 /* 621 * Read the partial sement at offset. 622 * 623 * If finfo_func and ino_func are both NULL, check the summary 624 * and data checksums. During roll forward, this must be done in its 625 * entirety before processing any blocks. 626 * 627 * If finfo_func is given, use that to process every file block 628 * in the segment summary. If ino_func is given, use that to process 629 * every inode block. 630 */ 631 int 632 lfs_parse_pseg(struct lfs *fs, daddr_t *offsetp, u_int64_t nextserial, 633 kauth_cred_t cred, int *pseg_flags, struct lwp *l, 634 int (*ino_func)(struct lfs_inofuncarg *), 635 int (*finfo_func)(struct lfs_finfofuncarg *), 636 int flags, void *arg) 637 { 638 struct vnode *devvp; 639 struct buf *bp, *dbp; 640 int error, ninos, i, j; 641 SEGSUM *ssp; 642 daddr_t offset, prevoffset; 643 IINFO *iip; 644 FINFO *fip; 645 size_t size; 646 uint32_t datasum, foundsum; 647 char *buf; 648 struct lfs_inofuncarg lifa; 649 struct lfs_finfofuncarg lffa; 650 651 KASSERT(fs != NULL); 652 KASSERT(offsetp != NULL); 653 654 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 655 656 /* Set up callback arguments */ 657 lifa.fs = fs; 658 /* lifa.offset = offset; */ 659 lifa.cred = cred; 660 lifa.l = l; 661 lifa.buf = malloc(lfs_sb_getbsize(fs), M_SEGMENT, M_WAITOK); 662 663 lifa.arg = arg; 664 665 lffa.fs = fs; 666 /* lffa.offsetp = offsetp; */ 667 /* lffa.finfop = finfop; */ 668 lffa.cred = cred; 669 lffa.l = l; 670 lffa.arg = arg; 671 672 prevoffset = *offsetp; 673 lfs_skip_superblock(fs, offsetp); 674 offset = *offsetp; 675 676 /* Read in the segment summary */ 677 buf = malloc(lfs_sb_getsumsize(fs), M_SEGMENT, M_WAITOK); 678 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 679 0, &bp); 680 if (error) 681 goto err; 682 memcpy(buf, bp->b_data, bp->b_bcount); 683 brelse(bp, BC_AGE); 684 685 ssp = (SEGSUM *)buf; 686 687 if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) { 688 DLOG((DLOG_RF, "Bad magic at 0x%" PRIx64 "\n", 689 offset)); 690 offset = -1; 691 goto err; 692 } 693 694 if (flags & CKSEG_CKSUM) { 695 size_t sumstart; 696 697 sumstart = lfs_ss_getsumstart(fs); 698 if (lfs_ss_getsumsum(fs, ssp) != 699 cksum((char *)ssp + sumstart, 700 lfs_sb_getsumsize(fs) - sumstart)) { 701 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", 702 offset)); 703 offset = -1; 704 goto err; 705 } 706 } 707 708 #if 0 709 /* 710 * Under normal conditions, we should never be producing 711 * a partial segment with neither inode blocks nor data blocks. 712 * However, these do sometimes appear and they need not 713 * prevent us from continuing. 714 */ 715 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 716 lfs_ss_getninos(fs, ssp) == 0) { 717 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", 718 offset)); 719 offset = -1; 720 goto err; 721 } 722 #endif /* 0 */ 723 724 if (lfs_sb_getversion(fs) == 1) { 725 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 726 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 727 offset = -1; 728 goto err; 729 } 730 } else { 731 if (nextserial > 0 732 && lfs_ss_getserial(fs, ssp) != nextserial) { 733 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx," 734 " expected 0x%jx\n", (intmax_t)offset, 735 (intmax_t)lfs_ss_getserial(fs, ssp), 736 (intmax_t)nextserial)); 737 offset = -1; 738 goto err; 739 } 740 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 741 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 742 PRIx64 "\n", lfs_ss_getident(fs, ssp), 743 lfs_sb_getident(fs), offset)); 744 offset = -1; 745 goto err; 746 } 747 } 748 749 #ifdef DIAGNOSTIC 750 if (lfs_ss_getnfinfo(fs, ssp) > lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)) { 751 printf("At offset 0x%jx, nfinfo %jd > max frags %jd\n", 752 (intmax_t)offset, 753 (intmax_t)lfs_ss_getnfinfo(fs, ssp), 754 (intmax_t)lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 755 } 756 #endif 757 KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 758 #ifdef DIAGNOSTIC 759 if (lfs_ss_getnfinfo(fs, ssp) > lfs_sb_getfsize(fs) / sizeof(FINFO32)) { 760 printf("At offset 0x%jx, nfinfo %jd > max entries %jd\n", 761 (intmax_t)offset, 762 (intmax_t)lfs_ss_getnfinfo(fs, ssp), 763 (intmax_t)lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 764 } 765 #endif 766 KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getfsize(fs) / sizeof(FINFO32)); 767 768 if (pseg_flags) 769 *pseg_flags = lfs_ss_getflags(fs, ssp); 770 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 771 iip = SEGSUM_IINFOSTART(fs, buf); 772 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf); 773 774 /* Handle individual blocks */ 775 foundsum = 0; 776 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 777 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 778 /* Inode block? */ 779 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 780 if (flags & CKSEG_CKSUM) { 781 /* Read in the head and add to the buffer */ 782 error = bread(devvp, LFS_FSBTODB(fs, offset), 783 lfs_sb_getbsize(fs), 0, &dbp); 784 if (error) { 785 offset = -1; 786 goto err; 787 } 788 foundsum = lfs_cksum_part(dbp->b_data, 789 sizeof(uint32_t), foundsum); 790 brelse(dbp, BC_AGE); 791 } else if (ino_func != NULL) { 792 lifa.offset = offset; 793 error = (*ino_func)(&lifa); 794 if (error != 0) { 795 offset = -1; 796 goto err; 797 } 798 } 799 800 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 801 iip = NEXTLOWER_IINFO(fs, iip); 802 --ninos; 803 --i; /* compensate for ++i in loop header */ 804 continue; 805 } 806 807 /* File block */ 808 size = lfs_sb_getbsize(fs); 809 if (flags & CKSEG_CKSUM) { 810 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 811 if (j == lfs_fi_getnblocks(fs, fip) - 1) 812 size = lfs_fi_getlastlength(fs, fip); 813 error = bread(devvp, LFS_FSBTODB(fs, offset), 814 size, 0, &dbp); 815 if (error) { 816 offset = -1; 817 goto err; 818 } 819 foundsum = lfs_cksum_part(dbp->b_data, 820 sizeof(uint32_t), foundsum); 821 brelse(dbp, BC_AGE); 822 offset += lfs_btofsb(fs, size); 823 } 824 } else if (finfo_func != NULL) { 825 lffa.offsetp = &offset; 826 lffa.finfop = fip; 827 (*finfo_func)(&lffa); 828 } else { 829 int n = lfs_fi_getnblocks(fs, fip); 830 size = lfs_fi_getlastlength(fs, fip); 831 offset += lfs_btofsb(fs, lfs_sb_getbsize(fs) * (n - 1) 832 + size); 833 } 834 fip = NEXT_FINFO(fs, fip); 835 } 836 837 /* Checksum the array, compare */ 838 if (flags & CKSEG_CKSUM) { 839 datasum = lfs_ss_getdatasum(fs, ssp); 840 foundsum = lfs_cksum_fold(foundsum); 841 if (datasum != foundsum) { 842 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 843 " (wanted %x got %x)\n", 844 offset, datasum, foundsum)); 845 offset = -1; 846 goto err; 847 } 848 } else { 849 /* Don't clog the buffer queue */ 850 mutex_enter(&lfs_lock); 851 if (locked_queue_count > LFS_MAX_BUFS || 852 locked_queue_bytes > LFS_MAX_BYTES) { 853 lfs_flush(fs, SEGM_CKP, 0); 854 } 855 mutex_exit(&lfs_lock); 856 } 857 858 /* 859 * If we're at the end of the segment, move to the next. 860 * A partial segment needs space for a segment header (1 fsb) 861 * and a full block ("frag" fsb). Thus, adding "frag" fsb should 862 * still be within the current segment (whereas frag + 1 might 863 * be at the start of the next segment). 864 * 865 * This needs to match the definition of LFS_PARTIAL_FITS 866 * in lfs_segment.c. 867 */ 868 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs)) 869 != lfs_dtosn(fs, offset)) { 870 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, 871 ssp))) { 872 offset = -1; 873 goto err; 874 } 875 offset = lfs_ss_getnext(fs, ssp); 876 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 877 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 878 } 879 if (flags & CKSEG_AVAIL) 880 lfs_sb_subavail(fs, offset - prevoffset); 881 882 err: 883 free(lifa.buf, M_SEGMENT); 884 free(buf, M_SEGMENT); 885 886 *offsetp = offset; 887 return 0; 888 } 889 890 /* 891 * Roll forward. 892 */ 893 void 894 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 895 { 896 int flags, dirty; 897 daddr_t startoffset, offset, nextoffset, endpseg; 898 u_int64_t nextserial, startserial, endserial; 899 int sn, curseg; 900 struct proc *p; 901 kauth_cred_t cred; 902 SEGUSE *sup; 903 struct buf *bp; 904 905 p = l ? l->l_proc : NULL; 906 cred = p ? p->p_cred : NOCRED; 907 908 /* 909 * We don't roll forward for v1 filesystems, because 910 * of the danger that the clock was turned back between the last 911 * checkpoint and crash. This would roll forward garbage. 912 * 913 * v2 filesystems don't have this problem because they use a 914 * monotonically increasing serial number instead of a timestamp. 915 */ 916 rblkcnt = 0; 917 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw 918 || lfs_sb_getversion(fs) <= 1 || p == NULL) 919 return; 920 921 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n", 922 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs))); 923 DEBUG_CHECK_FREELIST(fs); 924 925 /* 926 * Phase I: Find the address of the last good partial 927 * segment that was written after the checkpoint. Mark 928 * the segments in question dirty, so they won't be 929 * reallocated. 930 */ 931 endpseg = startoffset = offset = lfs_sb_getoffset(fs); 932 flags = 0x0; 933 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 934 PRIx64 "\n", offset)); 935 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 936 if (!(sup->su_flags & SEGUSE_DIRTY)) 937 lfs_sb_subnclean(fs, 1); 938 sup->su_flags |= SEGUSE_DIRTY; 939 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 940 941 startserial = lfs_sb_getserial(fs); 942 endserial = nextserial = startserial + 1; 943 nextoffset = offset; 944 while (1) { 945 nextoffset = offset; 946 lfs_parse_pseg(fs, &nextoffset, nextserial, 947 cred, &flags, l, NULL, NULL, CKSEG_CKSUM, NULL); 948 if (nextoffset == -1) 949 break; 950 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) { 951 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), 952 bp); 953 if (!(sup->su_flags & SEGUSE_DIRTY)) 954 lfs_sb_subnclean(fs, 1); 955 sup->su_flags |= SEGUSE_DIRTY; 956 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 957 } 958 959 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx" 960 " serial=0x%jx\n", (intmax_t)nextoffset, 961 (intmax_t)nextserial)); 962 if (flags & SS_DIROP) { 963 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 964 PRIx64 "\n", offset)); 965 if (!(flags & SS_CONT)) { 966 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 967 "at 0x%" PRIx64 "\n", offset)); 968 } 969 } 970 offset = nextoffset; 971 ++nextserial; 972 973 if (!(flags & SS_CONT)) { 974 endpseg = nextoffset; 975 endserial = nextserial; 976 } 977 if (lfs_rfw_max_psegs > 0 978 && nextserial > startserial + lfs_rfw_max_psegs) 979 break; 980 } 981 if (flags & SS_CONT) { 982 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 983 "dirops discarded (0x%jx < 0x%jx)\n", 984 endpseg, nextoffset)); 985 } 986 if (lfs_sb_getversion(fs) > 1) 987 lfs_sb_setserial(fs, endserial); 988 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 989 "endpseg=0x%" PRIx64 "\n", endpseg)); 990 offset = startoffset; 991 if (offset != endpseg) { 992 /* Don't overwrite what we're trying to preserve */ 993 lfs_sb_setoffset(fs, endpseg); 994 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg))); 995 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 996 sn = (sn + 1) % lfs_sb_getnseg(fs); 997 /* XXX could we just fail to roll forward? */ 998 if (sn == curseg) 999 panic("lfs_mountfs: no clean segments"); 1000 LFS_SEGENTRY(sup, fs, sn, bp); 1001 dirty = (sup->su_flags & SEGUSE_DIRTY); 1002 brelse(bp, 0); 1003 if (!dirty) 1004 break; 1005 } 1006 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 1007 /* Explicitly set this segment dirty */ 1008 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 1009 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 1010 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 1011 1012 /* 1013 * Phase II: Identify the highest generation of each 1014 * inode. We will ignore inodes and data blocks 1015 * belonging to old versions. 1016 */ 1017 offset = startoffset; 1018 nextserial = startserial + 1; 1019 DLOG((DLOG_RF, "LFS roll forward phase 2 beginning\n")); 1020 while (offset > 0 && offset != endpseg) { 1021 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1022 NULL, l, update_inogen, NULL, 1023 CKSEG_NONE, NULL); 1024 DEBUG_CHECK_FREELIST(fs); 1025 } 1026 1027 /* 1028 * Phase III: Update inodes. 1029 */ 1030 offset = startoffset; 1031 nextserial = startserial + 1; 1032 DLOG((DLOG_RF, "LFS roll forward phase 3 beginning\n")); 1033 while (offset > 0 && offset != endpseg) { 1034 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1035 NULL, l, update_inoblk, NULL, 1036 CKSEG_NONE, NULL); 1037 DEBUG_CHECK_FREELIST(fs); 1038 } 1039 1040 /* 1041 * Phase IV: Roll forward, updating data blocks. 1042 */ 1043 offset = startoffset; 1044 nextserial = startserial + 1; 1045 DLOG((DLOG_RF, "LFS roll forward phase 4 beginning\n")); 1046 while (offset > 0 && offset != endpseg) { 1047 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1048 NULL, l, NULL, finfo_func_rfw, 1049 CKSEG_AVAIL, NULL); 1050 DEBUG_CHECK_FREELIST(fs); 1051 } 1052 1053 /* 1054 * Finish: flush our changes to disk. 1055 */ 1056 lfs_sb_setserial(fs, endserial); 1057 1058 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 1059 DLOG((DLOG_RF, "lfs_mountfs: roll forward " 1060 "examined %jd blocks\n", 1061 (intmax_t)(endpseg - startoffset))); 1062 } 1063 1064 /* Get rid of our vnodes, except the ifile */ 1065 drop_vnode_pages(mp, l); 1066 DLOG((DLOG_RF, "LFS roll forward complete\n")); 1067 printf("%s: roll forward recovered %d data blocks\n", 1068 lfs_sb_getfsmnt(fs), rblkcnt); 1069 1070 /* 1071 * At this point we have no more changes to write to disk. 1072 * Reset the "avail" count to match the segments as they 1073 * appear on disk, and the clean segment count. 1074 */ 1075 lfs_reset_avail(fs); 1076 } 1077 1078 static bool 1079 all_selector(void *cl, struct vnode *vp) 1080 { 1081 return true; 1082 } 1083 1084 /* 1085 * Dump any pages from vnodes that may have been put on 1086 * during truncation. 1087 */ 1088 static void 1089 drop_vnode_pages(struct mount *mp, struct lwp *l) 1090 { 1091 struct vnode_iterator *marker; 1092 struct lfs *fs; 1093 struct vnode *vp; 1094 1095 fs = VFSTOULFS(mp)->um_lfs; 1096 vfs_vnode_iterator_init(mp, &marker); 1097 while ((vp = vfs_vnode_iterator_next(marker, 1098 all_selector, NULL)) != NULL) { 1099 if (vp == fs->lfs_ivnode) 1100 continue; 1101 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); 1102 uvm_vnp_setsize(vp, 0); 1103 uvm_vnp_setsize(vp, VTOI(vp)->i_size); 1104 VOP_UNLOCK(vp); 1105 vrele(vp); 1106 } 1107 vfs_vnode_iterator_destroy(marker); 1108 } 1109