1 /* $NetBSD: lfs_rfw.c,v 1.41 2025/11/06 15:45:32 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2025 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.41 2025/11/06 15:45:32 perseant Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/file.h> 50 #include <sys/disklabel.h> 51 #include <sys/ioctl.h> 52 #include <sys/errno.h> 53 #include <sys/malloc.h> 54 #include <sys/pool.h> 55 #include <sys/socket.h> 56 #include <sys/stat.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 #include <sys/conf.h> 60 #include <sys/kauth.h> 61 62 #include <miscfs/specfs/specdev.h> 63 64 #include <ufs/lfs/ulfs_quotacommon.h> 65 #include <ufs/lfs/ulfs_inode.h> 66 #include <ufs/lfs/ulfsmount.h> 67 #include <ufs/lfs/ulfs_extern.h> 68 69 #include <uvm/uvm_extern.h> 70 71 #include <ufs/lfs/lfs.h> 72 #include <ufs/lfs/lfs_accessors.h> 73 #include <ufs/lfs/lfs_kernel.h> 74 #include <ufs/lfs/lfs_extern.h> 75 76 #include <miscfs/genfs/genfs.h> 77 #include <miscfs/genfs/genfs_node.h> 78 79 /* 80 * Roll-forward code. 81 */ 82 static bool all_selector(void *, struct vnode *); 83 static void drop_vnode_pages(struct mount *, struct lwp *); 84 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, 85 const union lfs_dinode *); 86 static int update_inogen(struct lfs_inofuncarg *); 87 static int update_inoblk(struct lfs_inofuncarg *); 88 static int finfo_func_rfw(struct lfs_finfofuncarg *); 89 90 static int update_meta(struct lfs *, ino_t, int, daddr_t, daddr_t, size_t, 91 struct lwp *l); 92 #if 0 93 static bool lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2); 94 #endif 95 96 extern int lfs_do_rfw; 97 int rblkcnt; 98 int lfs_rfw_max_psegs = 0; 99 100 /* 101 * Allocate a particular inode with a particular version number, freeing 102 * any previous versions of this inode that may have gone before. 103 * Used by the roll-forward code. 104 * 105 * XXX this function does not have appropriate locking to be used on a live fs; 106 * XXX but something similar could probably be used for an "undelete" call. 107 * 108 * Called with the Ifile inode locked. 109 */ 110 int 111 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 112 struct vnode **vpp, union lfs_dinode *dip) 113 { 114 struct vattr va; 115 struct vnode *vp; 116 struct inode *ip; 117 int error; 118 119 KASSERT(ino > LFS_IFILE_INUM); 120 LFS_ASSERT_MAXINO(fs, ino); 121 122 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 123 124 /* 125 * First, just try a vget. If the version number is the one we want, 126 * we don't have to do anything else. If the version number is wrong, 127 * take appropriate action. 128 */ 129 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 130 if (error == 0) { 131 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", 132 (int)ino, vp)); 133 134 *vpp = vp; 135 ip = VTOI(vp); 136 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking" 137 " version %jd\n", (intmax_t)ip->i_gen, 138 (intmax_t)(dip == NULL ? -1 139 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers)); 140 if (ip->i_gen == vers) { 141 /* 142 * We have what we wanted already. 143 */ 144 DLOG((DLOG_RF, " pre-existing\n")); 145 return 0; 146 } else if (ip->i_gen < vers && dip != NULL 147 && lfs_dino_getnlink(fs, dip) > 0) { 148 /* 149 * We have found a newer version. Truncate 150 * the old vnode to zero and re-initialize 151 * from the given dinode. 152 */ 153 DLOG((DLOG_RF, " replace old version %jd\n", 154 (intmax_t)ip->i_gen)); 155 lfs_truncate(vp, (off_t)0, 0, NOCRED); 156 ip->i_gen = vers; 157 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip)); 158 update_inoblk_copy_dinode(fs, ip->i_din, dip); 159 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 160 return 0; 161 } else { 162 /* 163 * Not the right version and nothing to 164 * initialize from. Don't recover this data. 165 */ 166 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 167 (int)ino, (int)vers, 168 (int)lfs_dino_getgen(fs, ip->i_din))); 169 vput(vp); 170 *vpp = NULLVP; 171 return EEXIST; 172 } 173 } 174 175 /* 176 * No version of this inode was found in the cache. 177 * Make a new one from the dinode. We will add data blocks 178 * as they come in, so scrub any block addresses off of the 179 * inode and reset block counts to zero. 180 */ 181 if (dip == NULL) 182 return ENOENT; 183 184 vattr_null(&va); 185 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip)); 186 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS; 187 va.va_fileid = ino; 188 va.va_gen = vers; 189 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL, 190 &vp); 191 if (error) 192 return error; 193 error = vn_lock(vp, LK_EXCLUSIVE); 194 if (error) 195 goto err; 196 197 ip = VTOI(vp); 198 update_inoblk_copy_dinode(fs, ip->i_din, dip); 199 200 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d," 201 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size, 202 (int)ip->i_lfs_effnblks, 203 (int)lfs_dino_getblocks(fs, ip->i_din))); 204 *vpp = vp; 205 return 0; 206 207 err: 208 vrele(vp); 209 *vpp = NULLVP; 210 return error; 211 } 212 213 /* 214 * Load the appropriate indirect block, and change the appropriate pointer. 215 * Mark the block dirty. Do segment and avail accounting. 216 */ 217 static int 218 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 219 daddr_t ndaddr, size_t size, struct lwp *l) 220 { 221 int error; 222 struct vnode *vp; 223 struct inode *ip; 224 daddr_t odaddr; 225 struct indir a[ULFS_NIADDR]; 226 int num; 227 struct buf *bp; 228 SEGUSE *sup; 229 u_int64_t newsize, loff; 230 231 KASSERT(lbn >= 0); /* no indirect blocks */ 232 KASSERT(ino > LFS_IFILE_INUM); 233 LFS_ASSERT_MAXINO(fs, ino); 234 235 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n", 236 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr)); 237 238 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0) 239 return error; 240 ip = VTOI(vp); 241 242 /* 243 * If block already exists, note its new location 244 * but do not account it as new. 245 */ 246 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 247 if (odaddr == UNASSIGNED) { 248 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), 249 size, NOCRED, 0, &bp)) != 0) { 250 vput(vp); 251 return (error); 252 } 253 /* No need to write, the block is already on disk */ 254 if (bp->b_oflags & BO_DELWRI) { 255 LFS_UNLOCK_BUF(bp); 256 /* Account recovery of the previous version */ 257 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 258 } 259 brelse(bp, BC_INVAL); 260 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d," 261 " lfs_dino_getblocks(fs, ip->i_din) = %d\n", 262 (int)ip->i_lfs_effnblks, 263 (int)lfs_dino_getblocks(fs, ip->i_din))); 264 } else { 265 /* XXX fragextend? */ 266 DLOG((DLOG_RF, "block exists, no balloc\n")); 267 } 268 269 /* 270 * Extend the file, if it is not large enough already. 271 * XXX This is not exactly right, we don't know how much of the 272 * XXX last block is actually used. 273 * 274 * XXX We should be able to encode the actual data length of the 275 * XXX last block in fi_lastlength, since we can infer the 276 * XXX necessary block length from that using a variant of 277 * XXX lfs_blksize(). 278 */ 279 loff = lfs_lblktosize(fs, lbn); 280 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) { 281 /* No fragments */ 282 newsize = loff + 1; 283 } else { 284 /* Subtract only a fragment to account for block size */ 285 newsize = loff + size - lfs_fsbtob(fs, 1) + 1; 286 } 287 288 if (ip->i_size < newsize) { 289 DLOG((DLOG_RF, "ino %d size %d -> %d\n", 290 (int)ino, (int)ip->i_size, (int)newsize)); 291 lfs_dino_setsize(fs, ip->i_din, newsize); 292 ip->i_size = newsize; 293 /* 294 * tell vm our new size for the case the inode won't 295 * appear later. 296 */ 297 uvm_vnp_setsize(vp, newsize); 298 } 299 300 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 301 302 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 303 sup->su_nbytes += size; 304 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 305 306 /* differences here should be due to UNWRITTEN indirect blocks. */ 307 if (vp->v_type != VLNK) { 308 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)) 309 #if 0 310 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 311 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) 312 #endif /* 0 */ 313 ) { 314 vprint("vnode", vp); 315 printf("effnblks=%jd dino_getblocks=%jd\n", 316 (intmax_t)ip->i_lfs_effnblks, 317 (intmax_t)lfs_dino_getblocks(fs, ip->i_din)); 318 } 319 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 320 #if 0 321 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 322 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)); 323 #endif /* 0 */ 324 } 325 326 #ifdef DEBUG 327 /* Now look again to make sure it worked */ 328 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 329 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 330 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd" 331 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr)); 332 #endif /* DEBUG */ 333 vput(vp); 334 return 0; 335 } 336 337 /* 338 * Copy some the fields of the dinode as needed by update_inoblk(). 339 */ 340 static void 341 update_inoblk_copy_dinode(struct lfs *fs, 342 union lfs_dinode *dstu, const union lfs_dinode *srcu) 343 { 344 if (fs->lfs_is64) { 345 struct lfs64_dinode *dst = &dstu->u_64; 346 const struct lfs64_dinode *src = &srcu->u_64; 347 unsigned i; 348 349 /* 350 * Copy everything but the block pointers and di_blocks. 351 * XXX what about di_extb? 352 */ 353 dst->di_mode = src->di_mode; 354 dst->di_nlink = src->di_nlink; 355 dst->di_uid = src->di_uid; 356 dst->di_gid = src->di_gid; 357 dst->di_blksize = src->di_blksize; 358 dst->di_size = src->di_size; 359 dst->di_atime = src->di_atime; 360 dst->di_mtime = src->di_mtime; 361 dst->di_ctime = src->di_ctime; 362 dst->di_birthtime = src->di_birthtime; 363 dst->di_mtimensec = src->di_mtimensec; 364 dst->di_atimensec = src->di_atimensec; 365 dst->di_ctimensec = src->di_ctimensec; 366 dst->di_birthnsec = src->di_birthnsec; 367 dst->di_gen = src->di_gen; 368 dst->di_kernflags = src->di_kernflags; 369 dst->di_flags = src->di_flags; 370 dst->di_extsize = src->di_extsize; 371 dst->di_modrev = src->di_modrev; 372 dst->di_inumber = src->di_inumber; 373 for (i = 0; i < __arraycount(src->di_spare); i++) { 374 dst->di_spare[i] = src->di_spare[i]; 375 } 376 /* Short symlinks store their data in di_db. */ 377 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 378 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 379 memcpy(dst->di_db, src->di_db, src->di_size); 380 } 381 } else { 382 struct lfs32_dinode *dst = &dstu->u_32; 383 const struct lfs32_dinode *src = &srcu->u_32; 384 385 /* Get mode, link count, size, and times */ 386 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 387 388 /* Then the rest, except di_blocks */ 389 dst->di_flags = src->di_flags; 390 dst->di_gen = src->di_gen; 391 dst->di_uid = src->di_uid; 392 dst->di_gid = src->di_gid; 393 dst->di_modrev = src->di_modrev; 394 395 /* Short symlinks store their data in di_db. */ 396 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 397 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 398 memcpy(dst->di_db, src->di_db, src->di_size); 399 } 400 } 401 } 402 403 static int 404 update_inoblk(struct lfs_inofuncarg *lifa) 405 { 406 struct lfs *fs; 407 daddr_t offset, daddr; 408 struct lwp *l; 409 struct vnode *devvp, *vp; 410 struct inode *ip; 411 union lfs_dinode *dip; 412 struct buf *dbp, *ibp; 413 int error; 414 IFILE *ifp; 415 SEGUSE *sup; 416 unsigned i, num; 417 uint32_t gen, osn, nsn; 418 char *buf; 419 ino_t ino; 420 421 fs = lifa->fs; 422 offset = lifa->offset; 423 l = lifa->l; 424 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 425 426 /* 427 * Get the inode, update times and perms. 428 * DO NOT update disk blocks, we do that separately. 429 */ 430 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 431 0, &dbp); 432 if (error) { 433 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 434 return error; 435 } 436 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK); 437 memcpy(buf, dbp->b_data, dbp->b_bcount); 438 brelse(dbp, BC_AGE); 439 num = LFS_INOPB(fs); 440 for (i = num; i-- > 0; ) { 441 dip = DINO_IN_BLOCK(fs, buf, i); 442 ino = lfs_dino_getinumber(fs, dip); 443 if (ino <= LFS_IFILE_INUM) 444 continue; 445 446 LFS_ASSERT_MAXINO(fs, ino); 447 448 /* Check generation number */ 449 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 450 gen = lfs_if_getversion(fs, ifp); 451 brelse(ibp, 0); 452 if (lfs_dino_getgen(fs, dip) < gen) { 453 continue; 454 } 455 456 /* 457 * This inode is the newest generation. Load it. 458 */ 459 error = lfs_rf_valloc(fs, ino, lfs_dino_getgen(fs, dip), 460 l, &vp, dip); 461 if (error) { 462 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 463 " returned %d\n", error)); 464 continue; 465 } 466 ip = VTOI(vp); 467 if (lfs_dino_getsize(fs, dip) != ip->i_size 468 && vp->v_type != VLNK) { 469 /* XXX What should we do with symlinks? */ 470 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n", 471 (intmax_t)ino, 472 (intmax_t)ip->i_size, 473 (intmax_t)lfs_dino_getsize(fs, dip))); 474 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 475 NOCRED); 476 } 477 update_inoblk_copy_dinode(fs, ip->i_din, dip); 478 479 ip->i_flags = lfs_dino_getflags(fs, dip); 480 ip->i_gen = lfs_dino_getgen(fs, dip); 481 ip->i_uid = lfs_dino_getuid(fs, dip); 482 ip->i_gid = lfs_dino_getgid(fs, dip); 483 484 ip->i_mode = lfs_dino_getmode(fs, dip); 485 ip->i_nlink = lfs_dino_getnlink(fs, dip); 486 ip->i_size = lfs_dino_getsize(fs, dip); 487 488 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 489 490 /* Re-initialize to get type right */ 491 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 492 &vp); 493 494 /* Record change in location */ 495 LFS_IENTRY(ifp, fs, ino, ibp); 496 daddr = lfs_if_getdaddr(fs, ifp); 497 lfs_if_setdaddr(fs, ifp, offset); 498 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 499 /* And do segment accounting */ 500 osn = lfs_dtosn(fs, daddr); 501 nsn = lfs_dtosn(fs, offset); 502 if (DADDR_IS_BAD(daddr) || osn != nsn) { 503 if (!DADDR_IS_BAD(daddr)) { 504 LFS_SEGENTRY(sup, fs, osn, ibp); 505 sup->su_nbytes -= DINOSIZE(fs); 506 LFS_WRITESEGENTRY(sup, fs, osn, ibp); 507 } 508 LFS_SEGENTRY(sup, fs, nsn, ibp); 509 sup->su_nbytes += DINOSIZE(fs); 510 LFS_WRITESEGENTRY(sup, fs, nsn, ibp); 511 } 512 vput(vp); 513 } 514 free(buf, M_SEGMENT); 515 516 return 0; 517 } 518 519 /* 520 * Note the highest generation number of each inode in the Ifile. 521 * This allows us to skip processing data for intermediate versions. 522 */ 523 static int 524 update_inogen(struct lfs_inofuncarg *lifa) 525 { 526 struct lfs *fs; 527 daddr_t offset; 528 struct vnode *devvp; 529 union lfs_dinode *dip; 530 struct buf *dbp, *ibp; 531 int error; 532 IFILE *ifp; 533 unsigned i, num; 534 535 fs = lifa->fs; 536 offset = lifa->offset; 537 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 538 539 /* Read inode block */ 540 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 541 0, &dbp); 542 if (error) { 543 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 544 return error; 545 } 546 547 /* Check each inode against ifile entry */ 548 num = LFS_INOPB(fs); 549 for (i = num; i-- > 0; ) { 550 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 551 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM) 552 continue; 553 554 /* Update generation number */ 555 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 556 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip)) 557 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip)); 558 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 559 if (error) 560 break; 561 } 562 brelse(dbp, 0); 563 564 return error; 565 } 566 567 static int 568 finfo_func_rfw(struct lfs_finfofuncarg *lffa) 569 { 570 struct lfs *fs; 571 FINFO *fip; 572 daddr_t *offsetp; 573 struct lwp *l; 574 int j; 575 size_t size; 576 ino_t ino; 577 578 fs = lffa->fs; 579 fip = lffa->finfop; 580 offsetp = lffa->offsetp; 581 l = lffa->l; 582 size = lfs_sb_getbsize(fs); 583 ino = lfs_fi_getino(fs, fip); 584 LFS_ASSERT_MAXINO(fs, ino); 585 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 586 if (j == lfs_fi_getnblocks(fs, fip) - 1) 587 size = lfs_fi_getlastlength(fs, fip); 588 589 /* Account for and update any direct blocks */ 590 if (ino > LFS_IFILE_INUM && 591 lfs_fi_getblock(fs, fip, j) >= 0) { 592 update_meta(fs, ino, 593 lfs_fi_getversion(fs, fip), 594 lfs_fi_getblock(fs, fip, j), 595 *offsetp, size, l); 596 ++rblkcnt; 597 } 598 *offsetp += lfs_btofsb(fs, size); 599 } 600 601 return 0; 602 } 603 604 int 605 lfs_skip_superblock(struct lfs *fs, daddr_t *offsetp) 606 { 607 daddr_t offset; 608 int i; 609 610 /* 611 * If this is segment 0, skip the label. 612 * If the segment has a superblock and we're at the top 613 * of the segment, skip the superblock. 614 */ 615 offset = *offsetp; 616 if (offset == lfs_sb_gets0addr(fs)) { 617 offset += lfs_btofsb(fs, LFS_LABELPAD); 618 } 619 for (i = 0; i < LFS_MAXNUMSB; i++) { 620 if (offset == lfs_sb_getsboff(fs, i)) { 621 offset += lfs_btofsb(fs, LFS_SBPAD); 622 break; 623 } 624 } 625 *offsetp = offset; 626 return 0; 627 } 628 629 /* 630 * Read the partial sement at offset. 631 * 632 * If finfo_func and ino_func are both NULL, check the summary 633 * and data checksums. During roll forward, this must be done in its 634 * entirety before processing any blocks. 635 * 636 * If finfo_func is given, use that to process every file block 637 * in the segment summary. If ino_func is given, use that to process 638 * every inode block. 639 */ 640 int 641 lfs_parse_pseg(struct lfs *fs, daddr_t *offsetp, u_int64_t nextserial, 642 kauth_cred_t cred, int *pseg_flags, struct lwp *l, 643 int (*ino_func)(struct lfs_inofuncarg *), 644 int (*finfo_func)(struct lfs_finfofuncarg *), 645 int flags, void *arg) 646 { 647 struct vnode *devvp; 648 struct buf *bp, *dbp; 649 int error, ninos, i, j; 650 SEGSUM *ssp; 651 daddr_t offset, prevoffset; 652 IINFO *iip; 653 FINFO *fip; 654 size_t size; 655 uint32_t datasum, foundsum; 656 char *buf; 657 struct lfs_inofuncarg lifa; 658 struct lfs_finfofuncarg lffa; 659 660 KASSERT(fs != NULL); 661 KASSERT(offsetp != NULL); 662 663 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 664 665 /* Set up callback arguments */ 666 lifa.fs = fs; 667 /* lifa.offset = offset; */ 668 lifa.cred = cred; 669 lifa.l = l; 670 lifa.buf = malloc(lfs_sb_getbsize(fs), M_SEGMENT, M_WAITOK); 671 672 lifa.arg = arg; 673 674 lffa.fs = fs; 675 /* lffa.offsetp = offsetp; */ 676 /* lffa.finfop = finfop; */ 677 lffa.cred = cred; 678 lffa.l = l; 679 lffa.arg = arg; 680 681 prevoffset = *offsetp; 682 lfs_skip_superblock(fs, offsetp); 683 offset = *offsetp; 684 685 /* Read in the segment summary */ 686 buf = malloc(lfs_sb_getsumsize(fs), M_SEGMENT, M_WAITOK); 687 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 688 0, &bp); 689 if (error) 690 goto err; 691 memcpy(buf, bp->b_data, bp->b_bcount); 692 brelse(bp, BC_AGE); 693 694 ssp = (SEGSUM *)buf; 695 696 if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) { 697 DLOG((DLOG_RF, "Bad magic at 0x%" PRIx64 "\n", 698 offset)); 699 offset = -1; 700 goto err; 701 } 702 703 if (flags & CKSEG_CKSUM) { 704 size_t sumstart; 705 706 sumstart = lfs_ss_getsumstart(fs); 707 if (lfs_ss_getsumsum(fs, ssp) != 708 cksum((char *)ssp + sumstart, 709 lfs_sb_getsumsize(fs) - sumstart)) { 710 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", 711 offset)); 712 offset = -1; 713 goto err; 714 } 715 } 716 717 #if 0 718 /* 719 * Under normal conditions, we should never be producing 720 * a partial segment with neither inode blocks nor data blocks. 721 * However, these do sometimes appear and they need not 722 * prevent us from continuing. 723 */ 724 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 725 lfs_ss_getninos(fs, ssp) == 0) { 726 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", 727 offset)); 728 offset = -1; 729 goto err; 730 } 731 #endif /* 0 */ 732 733 if (lfs_sb_getversion(fs) == 1) { 734 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 735 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 736 offset = -1; 737 goto err; 738 } 739 } else { 740 if (nextserial > 0 741 && lfs_ss_getserial(fs, ssp) != nextserial) { 742 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx," 743 " expected 0x%jx\n", (intmax_t)offset, 744 (intmax_t)lfs_ss_getserial(fs, ssp), 745 (intmax_t)nextserial)); 746 offset = -1; 747 goto err; 748 } 749 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 750 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 751 PRIx64 "\n", lfs_ss_getident(fs, ssp), 752 lfs_sb_getident(fs), offset)); 753 offset = -1; 754 goto err; 755 } 756 } 757 758 #ifdef DIAGNOSTIC 759 if (lfs_ss_getnfinfo(fs, ssp) > lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)) { 760 printf("At offset 0x%jx, nfinfo %jd > max frags %jd\n", 761 (intmax_t)offset, 762 (intmax_t)lfs_ss_getnfinfo(fs, ssp), 763 (intmax_t)lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 764 } 765 #endif 766 KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 767 #ifdef DIAGNOSTIC 768 if (lfs_ss_getnfinfo(fs, ssp) > lfs_sb_getfsize(fs) / sizeof(FINFO32)) { 769 printf("At offset 0x%jx, nfinfo %jd > max entries %jd\n", 770 (intmax_t)offset, 771 (intmax_t)lfs_ss_getnfinfo(fs, ssp), 772 (intmax_t)lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 773 } 774 #endif 775 KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getfsize(fs) / sizeof(FINFO32)); 776 777 if (pseg_flags) 778 *pseg_flags = lfs_ss_getflags(fs, ssp); 779 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 780 iip = SEGSUM_IINFOSTART(fs, buf); 781 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf); 782 783 /* Handle individual blocks */ 784 foundsum = 0; 785 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 786 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 787 /* Inode block? */ 788 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 789 if (flags & CKSEG_CKSUM) { 790 /* Read in the head and add to the buffer */ 791 error = bread(devvp, LFS_FSBTODB(fs, offset), 792 lfs_sb_getbsize(fs), 0, &dbp); 793 if (error) { 794 offset = -1; 795 goto err; 796 } 797 foundsum = lfs_cksum_part(dbp->b_data, 798 sizeof(uint32_t), foundsum); 799 brelse(dbp, BC_AGE); 800 } else if (ino_func != NULL) { 801 lifa.offset = offset; 802 error = (*ino_func)(&lifa); 803 if (error != 0) { 804 offset = -1; 805 goto err; 806 } 807 } 808 809 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 810 iip = NEXTLOWER_IINFO(fs, iip); 811 --ninos; 812 --i; /* compensate for ++i in loop header */ 813 continue; 814 } 815 816 /* File block */ 817 size = lfs_sb_getbsize(fs); 818 if (flags & CKSEG_CKSUM) { 819 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 820 if (j == lfs_fi_getnblocks(fs, fip) - 1) 821 size = lfs_fi_getlastlength(fs, fip); 822 error = bread(devvp, LFS_FSBTODB(fs, offset), 823 size, 0, &dbp); 824 if (error) { 825 offset = -1; 826 goto err; 827 } 828 foundsum = lfs_cksum_part(dbp->b_data, 829 sizeof(uint32_t), foundsum); 830 brelse(dbp, BC_AGE); 831 offset += lfs_btofsb(fs, size); 832 } 833 } else if (finfo_func != NULL) { 834 lffa.offsetp = &offset; 835 lffa.finfop = fip; 836 (*finfo_func)(&lffa); 837 } else { 838 int n = lfs_fi_getnblocks(fs, fip); 839 size = lfs_fi_getlastlength(fs, fip); 840 offset += lfs_btofsb(fs, lfs_sb_getbsize(fs) * (n - 1) 841 + size); 842 } 843 fip = NEXT_FINFO(fs, fip); 844 } 845 846 /* Checksum the array, compare */ 847 if (flags & CKSEG_CKSUM) { 848 datasum = lfs_ss_getdatasum(fs, ssp); 849 foundsum = lfs_cksum_fold(foundsum); 850 if (datasum != foundsum) { 851 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 852 " (wanted %x got %x)\n", 853 offset, datasum, foundsum)); 854 offset = -1; 855 goto err; 856 } 857 } else { 858 /* Don't clog the buffer queue */ 859 mutex_enter(&lfs_lock); 860 if (locked_queue_count > LFS_MAX_BUFS || 861 locked_queue_bytes > LFS_MAX_BYTES) { 862 lfs_flush(fs, SEGM_CKP, 0); 863 } 864 mutex_exit(&lfs_lock); 865 } 866 867 /* 868 * If we're at the end of the segment, move to the next. 869 * A partial segment needs space for a segment header (1 fsb) 870 * and a full block ("frag" fsb). Thus, adding "frag" fsb should 871 * still be within the current segment (whereas frag + 1 might 872 * be at the start of the next segment). 873 * 874 * This needs to match the definition of LFS_PARTIAL_FITS 875 * in lfs_segment.c. 876 */ 877 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs)) 878 != lfs_dtosn(fs, offset)) { 879 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, 880 ssp))) { 881 offset = -1; 882 goto err; 883 } 884 offset = lfs_ss_getnext(fs, ssp); 885 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 886 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 887 } 888 if (flags & CKSEG_AVAIL) 889 lfs_sb_subavail(fs, offset - prevoffset); 890 891 err: 892 free(lifa.buf, M_SEGMENT); 893 free(buf, M_SEGMENT); 894 895 *offsetp = offset; 896 return 0; 897 } 898 899 /* 900 * Roll forward. 901 */ 902 void 903 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 904 { 905 int flags, dirty; 906 daddr_t startoffset, offset, nextoffset, endpseg; 907 u_int64_t nextserial, startserial, endserial; 908 int sn, curseg; 909 struct proc *p; 910 kauth_cred_t cred; 911 SEGUSE *sup; 912 struct buf *bp; 913 914 p = l ? l->l_proc : NULL; 915 cred = p ? p->p_cred : NOCRED; 916 917 /* 918 * We don't roll forward for v1 filesystems, because 919 * of the danger that the clock was turned back between the last 920 * checkpoint and crash. This would roll forward garbage. 921 * 922 * v2 filesystems don't have this problem because they use a 923 * monotonically increasing serial number instead of a timestamp. 924 */ 925 rblkcnt = 0; 926 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw 927 || lfs_sb_getversion(fs) <= 1 || p == NULL) 928 return; 929 930 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n", 931 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs))); 932 DEBUG_CHECK_FREELIST(fs); 933 934 /* 935 * Phase I: Find the address of the last good partial 936 * segment that was written after the checkpoint. Mark 937 * the segments in question dirty, so they won't be 938 * reallocated. 939 */ 940 endpseg = startoffset = offset = lfs_sb_getoffset(fs); 941 flags = 0x0; 942 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 943 PRIx64 "\n", offset)); 944 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 945 if (!(sup->su_flags & SEGUSE_DIRTY)) 946 lfs_sb_subnclean(fs, 1); 947 sup->su_flags |= SEGUSE_DIRTY; 948 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 949 950 startserial = lfs_sb_getserial(fs); 951 endserial = nextserial = startserial + 1; 952 nextoffset = offset; 953 while (1) { 954 nextoffset = offset; 955 lfs_parse_pseg(fs, &nextoffset, nextserial, 956 cred, &flags, l, NULL, NULL, CKSEG_CKSUM, NULL); 957 if (nextoffset == -1) 958 break; 959 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) { 960 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), 961 bp); 962 if (!(sup->su_flags & SEGUSE_DIRTY)) 963 lfs_sb_subnclean(fs, 1); 964 sup->su_flags |= SEGUSE_DIRTY; 965 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 966 } 967 968 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx" 969 " serial=0x%jx\n", (intmax_t)nextoffset, 970 (intmax_t)nextserial)); 971 if (flags & SS_DIROP) { 972 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 973 PRIx64 "\n", offset)); 974 if (!(flags & SS_CONT)) { 975 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 976 "at 0x%" PRIx64 "\n", offset)); 977 } 978 } 979 offset = nextoffset; 980 ++nextserial; 981 982 if (!(flags & SS_CONT)) { 983 endpseg = nextoffset; 984 endserial = nextserial; 985 } 986 if (lfs_rfw_max_psegs > 0 987 && nextserial > startserial + lfs_rfw_max_psegs) 988 break; 989 } 990 if (flags & SS_CONT) { 991 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 992 "dirops discarded (0x%jx < 0x%jx)\n", 993 endpseg, nextoffset)); 994 } 995 if (lfs_sb_getversion(fs) > 1) 996 lfs_sb_setserial(fs, endserial); 997 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 998 "endpseg=0x%" PRIx64 "\n", endpseg)); 999 offset = startoffset; 1000 if (offset != endpseg) { 1001 /* Don't overwrite what we're trying to preserve */ 1002 lfs_sb_setoffset(fs, endpseg); 1003 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg))); 1004 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 1005 sn = (sn + 1) % lfs_sb_getnseg(fs); 1006 /* XXX could we just fail to roll forward? */ 1007 if (sn == curseg) 1008 panic("lfs_mountfs: no clean segments"); 1009 LFS_SEGENTRY(sup, fs, sn, bp); 1010 dirty = (sup->su_flags & SEGUSE_DIRTY); 1011 brelse(bp, 0); 1012 if (!dirty) 1013 break; 1014 } 1015 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 1016 /* Explicitly set this segment dirty */ 1017 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 1018 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 1019 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 1020 1021 /* 1022 * Phase II: Identify the highest generation of each 1023 * inode. We will ignore inodes and data blocks 1024 * belonging to old versions. 1025 */ 1026 offset = startoffset; 1027 nextserial = startserial + 1; 1028 DLOG((DLOG_RF, "LFS roll forward phase 2 beginning\n")); 1029 while (offset > 0 && offset != endpseg) { 1030 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1031 NULL, l, update_inogen, NULL, 1032 CKSEG_NONE, NULL); 1033 DEBUG_CHECK_FREELIST(fs); 1034 } 1035 1036 /* 1037 * Phase III: Update inodes. 1038 */ 1039 offset = startoffset; 1040 nextserial = startserial + 1; 1041 DLOG((DLOG_RF, "LFS roll forward phase 3 beginning\n")); 1042 while (offset > 0 && offset != endpseg) { 1043 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1044 NULL, l, update_inoblk, NULL, 1045 CKSEG_NONE, NULL); 1046 DEBUG_CHECK_FREELIST(fs); 1047 } 1048 1049 /* 1050 * Phase IV: Roll forward, updating data blocks. 1051 */ 1052 offset = startoffset; 1053 nextserial = startserial + 1; 1054 DLOG((DLOG_RF, "LFS roll forward phase 4 beginning\n")); 1055 while (offset > 0 && offset != endpseg) { 1056 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1057 NULL, l, NULL, finfo_func_rfw, 1058 CKSEG_AVAIL, NULL); 1059 DEBUG_CHECK_FREELIST(fs); 1060 } 1061 1062 /* 1063 * Finish: flush our changes to disk. 1064 */ 1065 lfs_sb_setserial(fs, endserial); 1066 1067 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 1068 DLOG((DLOG_RF, "lfs_mountfs: roll forward " 1069 "examined %jd blocks\n", 1070 (intmax_t)(endpseg - startoffset))); 1071 } 1072 1073 /* Get rid of our vnodes, except the ifile */ 1074 drop_vnode_pages(mp, l); 1075 DLOG((DLOG_RF, "LFS roll forward complete\n")); 1076 printf("%s: roll forward recovered %d data blocks\n", 1077 lfs_sb_getfsmnt(fs), rblkcnt); 1078 1079 /* 1080 * At this point we have no more changes to write to disk. 1081 * Reset the "avail" count to match the segments as they 1082 * appear on disk, and the clean segment count. 1083 */ 1084 lfs_reset_avail(fs); 1085 } 1086 1087 static bool 1088 all_selector(void *cl, struct vnode *vp) 1089 { 1090 return true; 1091 } 1092 1093 /* 1094 * Dump any pages from vnodes that may have been put on 1095 * during truncation. 1096 */ 1097 static void 1098 drop_vnode_pages(struct mount *mp, struct lwp *l) 1099 { 1100 struct vnode_iterator *marker; 1101 struct lfs *fs; 1102 struct vnode *vp; 1103 1104 fs = VFSTOULFS(mp)->um_lfs; 1105 vfs_vnode_iterator_init(mp, &marker); 1106 while ((vp = vfs_vnode_iterator_next(marker, 1107 all_selector, NULL)) != NULL) { 1108 if (vp == fs->lfs_ivnode) 1109 continue; 1110 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); 1111 uvm_vnp_setsize(vp, 0); 1112 uvm_vnp_setsize(vp, VTOI(vp)->i_size); 1113 VOP_UNLOCK(vp); 1114 vrele(vp); 1115 } 1116 vfs_vnode_iterator_destroy(marker); 1117 } 1118