1 /* $NetBSD: lfs_rfw.c,v 1.40 2025/10/20 04:20:37 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2025 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.40 2025/10/20 04:20:37 perseant Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/file.h> 50 #include <sys/disklabel.h> 51 #include <sys/ioctl.h> 52 #include <sys/errno.h> 53 #include <sys/malloc.h> 54 #include <sys/pool.h> 55 #include <sys/socket.h> 56 #include <sys/stat.h> 57 #include <sys/syslog.h> 58 #include <sys/sysctl.h> 59 #include <sys/conf.h> 60 #include <sys/kauth.h> 61 62 #include <miscfs/specfs/specdev.h> 63 64 #include <ufs/lfs/ulfs_quotacommon.h> 65 #include <ufs/lfs/ulfs_inode.h> 66 #include <ufs/lfs/ulfsmount.h> 67 #include <ufs/lfs/ulfs_extern.h> 68 69 #include <uvm/uvm_extern.h> 70 71 #include <ufs/lfs/lfs.h> 72 #include <ufs/lfs/lfs_accessors.h> 73 #include <ufs/lfs/lfs_kernel.h> 74 #include <ufs/lfs/lfs_extern.h> 75 76 #include <miscfs/genfs/genfs.h> 77 #include <miscfs/genfs/genfs_node.h> 78 79 /* 80 * Roll-forward code. 81 */ 82 static bool all_selector(void *, struct vnode *); 83 static void drop_vnode_pages(struct mount *, struct lwp *); 84 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, 85 const union lfs_dinode *); 86 static int update_inogen(struct lfs_inofuncarg *); 87 static int update_inoblk(struct lfs_inofuncarg *); 88 static int ino_func_setclean(struct lfs_inofuncarg *); 89 static int finfo_func_rfw(struct lfs_finfofuncarg *); 90 static int finfo_func_rewrite(struct lfs_finfofuncarg *); 91 static int finfo_func_setclean(struct lfs_finfofuncarg *); 92 93 static int update_meta(struct lfs *, ino_t, int, daddr_t, daddr_t, size_t, 94 struct lwp *l); 95 static int skip_superblock(struct lfs *, daddr_t *); 96 static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t, size_t, int *); 97 #if 0 98 static bool lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2); 99 #endif 100 101 extern int lfs_do_rfw; 102 int rblkcnt; 103 int lfs_rfw_max_psegs = 0; 104 105 /* 106 * Allocate a particular inode with a particular version number, freeing 107 * any previous versions of this inode that may have gone before. 108 * Used by the roll-forward code. 109 * 110 * XXX this function does not have appropriate locking to be used on a live fs; 111 * XXX but something similar could probably be used for an "undelete" call. 112 * 113 * Called with the Ifile inode locked. 114 */ 115 int 116 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 117 struct vnode **vpp, union lfs_dinode *dip) 118 { 119 struct vattr va; 120 struct vnode *vp; 121 struct inode *ip; 122 int error; 123 124 KASSERT(ino > LFS_IFILE_INUM); 125 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 126 127 /* 128 * First, just try a vget. If the version number is the one we want, 129 * we don't have to do anything else. If the version number is wrong, 130 * take appropriate action. 131 */ 132 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 133 if (error == 0) { 134 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", 135 (int)ino, vp)); 136 137 *vpp = vp; 138 ip = VTOI(vp); 139 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking" 140 " version %jd\n", (intmax_t)ip->i_gen, 141 (intmax_t)(dip == NULL ? -1 142 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers)); 143 if (ip->i_gen == vers) { 144 /* 145 * We have what we wanted already. 146 */ 147 DLOG((DLOG_RF, " pre-existing\n")); 148 return 0; 149 } else if (ip->i_gen < vers && dip != NULL 150 && lfs_dino_getnlink(fs, dip) > 0) { 151 /* 152 * We have found a newer version. Truncate 153 * the old vnode to zero and re-initialize 154 * from the given dinode. 155 */ 156 DLOG((DLOG_RF, " replace old version %jd\n", 157 (intmax_t)ip->i_gen)); 158 lfs_truncate(vp, (off_t)0, 0, NOCRED); 159 ip->i_gen = vers; 160 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip)); 161 update_inoblk_copy_dinode(fs, ip->i_din, dip); 162 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 163 return 0; 164 } else { 165 /* 166 * Not the right version and nothing to 167 * initialize from. Don't recover this data. 168 */ 169 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 170 (int)ino, (int)vers, 171 (int)lfs_dino_getgen(fs, ip->i_din))); 172 vput(vp); 173 *vpp = NULLVP; 174 return EEXIST; 175 } 176 } 177 178 /* 179 * No version of this inode was found in the cache. 180 * Make a new one from the dinode. We will add data blocks 181 * as they come in, so scrub any block addresses off of the 182 * inode and reset block counts to zero. 183 */ 184 if (dip == NULL) 185 return ENOENT; 186 187 vattr_null(&va); 188 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip)); 189 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS; 190 va.va_fileid = ino; 191 va.va_gen = vers; 192 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL, 193 &vp); 194 if (error) 195 return error; 196 error = vn_lock(vp, LK_EXCLUSIVE); 197 if (error) 198 goto err; 199 200 ip = VTOI(vp); 201 update_inoblk_copy_dinode(fs, ip->i_din, dip); 202 203 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d," 204 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size, 205 (int)ip->i_lfs_effnblks, 206 (int)lfs_dino_getblocks(fs, ip->i_din))); 207 *vpp = vp; 208 return 0; 209 210 err: 211 vrele(vp); 212 *vpp = NULLVP; 213 return error; 214 } 215 216 /* 217 * Load the appropriate indirect block, and change the appropriate pointer. 218 * Mark the block dirty. Do segment and avail accounting. 219 */ 220 static int 221 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 222 daddr_t ndaddr, size_t size, struct lwp *l) 223 { 224 int error; 225 struct vnode *vp; 226 struct inode *ip; 227 daddr_t odaddr; 228 struct indir a[ULFS_NIADDR]; 229 int num; 230 struct buf *bp; 231 SEGUSE *sup; 232 u_int64_t newsize, loff; 233 234 KASSERT(lbn >= 0); /* no indirect blocks */ 235 KASSERT(ino > LFS_IFILE_INUM); 236 237 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n", 238 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr)); 239 240 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0) 241 return error; 242 ip = VTOI(vp); 243 244 /* 245 * If block already exists, note its new location 246 * but do not account it as new. 247 */ 248 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 249 if (odaddr == UNASSIGNED) { 250 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), 251 size, NOCRED, 0, &bp)) != 0) { 252 vput(vp); 253 return (error); 254 } 255 /* No need to write, the block is already on disk */ 256 if (bp->b_oflags & BO_DELWRI) { 257 LFS_UNLOCK_BUF(bp); 258 /* Account recovery of the previous version */ 259 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 260 } 261 brelse(bp, BC_INVAL); 262 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d," 263 " lfs_dino_getblocks(fs, ip->i_din) = %d\n", 264 (int)ip->i_lfs_effnblks, 265 (int)lfs_dino_getblocks(fs, ip->i_din))); 266 } else { 267 /* XXX fragextend? */ 268 DLOG((DLOG_RF, "block exists, no balloc\n")); 269 } 270 271 /* 272 * Extend the file, if it is not large enough already. 273 * XXX This is not exactly right, we don't know how much of the 274 * XXX last block is actually used. 275 * 276 * XXX We should be able to encode the actual data length of the 277 * XXX last block in fi_lastlength, since we can infer the 278 * XXX necessary block length from that using a variant of 279 * XXX lfs_blksize(). 280 */ 281 loff = lfs_lblktosize(fs, lbn); 282 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) { 283 /* No fragments */ 284 newsize = loff + 1; 285 } else { 286 /* Subtract only a fragment to account for block size */ 287 newsize = loff + size - lfs_fsbtob(fs, 1) + 1; 288 } 289 290 if (ip->i_size < newsize) { 291 DLOG((DLOG_RF, "ino %d size %d -> %d\n", 292 (int)ino, (int)ip->i_size, (int)newsize)); 293 lfs_dino_setsize(fs, ip->i_din, newsize); 294 ip->i_size = newsize; 295 /* 296 * tell vm our new size for the case the inode won't 297 * appear later. 298 */ 299 uvm_vnp_setsize(vp, newsize); 300 } 301 302 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 303 304 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 305 sup->su_nbytes += size; 306 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 307 308 /* differences here should be due to UNWRITTEN indirect blocks. */ 309 if (vp->v_type != VLNK) { 310 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)) 311 #if 0 312 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 313 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) 314 #endif /* 0 */ 315 ) { 316 vprint("vnode", vp); 317 printf("effnblks=%jd dino_getblocks=%jd\n", 318 (intmax_t)ip->i_lfs_effnblks, 319 (intmax_t)lfs_dino_getblocks(fs, ip->i_din)); 320 } 321 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 322 #if 0 323 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR || 324 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)); 325 #endif /* 0 */ 326 } 327 328 #ifdef DEBUG 329 /* Now look again to make sure it worked */ 330 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 331 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 332 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd" 333 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr)); 334 #endif /* DEBUG */ 335 vput(vp); 336 return 0; 337 } 338 339 /* 340 * Copy some the fields of the dinode as needed by update_inoblk(). 341 */ 342 static void 343 update_inoblk_copy_dinode(struct lfs *fs, 344 union lfs_dinode *dstu, const union lfs_dinode *srcu) 345 { 346 if (fs->lfs_is64) { 347 struct lfs64_dinode *dst = &dstu->u_64; 348 const struct lfs64_dinode *src = &srcu->u_64; 349 unsigned i; 350 351 /* 352 * Copy everything but the block pointers and di_blocks. 353 * XXX what about di_extb? 354 */ 355 dst->di_mode = src->di_mode; 356 dst->di_nlink = src->di_nlink; 357 dst->di_uid = src->di_uid; 358 dst->di_gid = src->di_gid; 359 dst->di_blksize = src->di_blksize; 360 dst->di_size = src->di_size; 361 dst->di_atime = src->di_atime; 362 dst->di_mtime = src->di_mtime; 363 dst->di_ctime = src->di_ctime; 364 dst->di_birthtime = src->di_birthtime; 365 dst->di_mtimensec = src->di_mtimensec; 366 dst->di_atimensec = src->di_atimensec; 367 dst->di_ctimensec = src->di_ctimensec; 368 dst->di_birthnsec = src->di_birthnsec; 369 dst->di_gen = src->di_gen; 370 dst->di_kernflags = src->di_kernflags; 371 dst->di_flags = src->di_flags; 372 dst->di_extsize = src->di_extsize; 373 dst->di_modrev = src->di_modrev; 374 dst->di_inumber = src->di_inumber; 375 for (i = 0; i < __arraycount(src->di_spare); i++) { 376 dst->di_spare[i] = src->di_spare[i]; 377 } 378 /* Short symlinks store their data in di_db. */ 379 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 380 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 381 memcpy(dst->di_db, src->di_db, src->di_size); 382 } 383 } else { 384 struct lfs32_dinode *dst = &dstu->u_32; 385 const struct lfs32_dinode *src = &srcu->u_32; 386 387 /* Get mode, link count, size, and times */ 388 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 389 390 /* Then the rest, except di_blocks */ 391 dst->di_flags = src->di_flags; 392 dst->di_gen = src->di_gen; 393 dst->di_uid = src->di_uid; 394 dst->di_gid = src->di_gid; 395 dst->di_modrev = src->di_modrev; 396 397 /* Short symlinks store their data in di_db. */ 398 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK 399 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) { 400 memcpy(dst->di_db, src->di_db, src->di_size); 401 } 402 } 403 } 404 405 static int 406 update_inoblk(struct lfs_inofuncarg *lifa) 407 { 408 struct lfs *fs; 409 daddr_t offset, daddr; 410 struct lwp *l; 411 struct vnode *devvp, *vp; 412 struct inode *ip; 413 union lfs_dinode *dip; 414 struct buf *dbp, *ibp; 415 int error; 416 IFILE *ifp; 417 SEGUSE *sup; 418 unsigned i, num; 419 uint32_t gen, osn, nsn; 420 char *buf; 421 422 fs = lifa->fs; 423 offset = lifa->offset; 424 l = lifa->l; 425 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 426 427 /* 428 * Get the inode, update times and perms. 429 * DO NOT update disk blocks, we do that separately. 430 */ 431 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 432 0, &dbp); 433 if (error) { 434 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 435 return error; 436 } 437 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK); 438 memcpy(buf, dbp->b_data, dbp->b_bcount); 439 brelse(dbp, BC_AGE); 440 num = LFS_INOPB(fs); 441 for (i = num; i-- > 0; ) { 442 dip = DINO_IN_BLOCK(fs, buf, i); 443 if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM) 444 continue; 445 446 /* Check generation number */ 447 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 448 gen = lfs_if_getversion(fs, ifp); 449 brelse(ibp, 0); 450 if (lfs_dino_getgen(fs, dip) < gen) { 451 continue; 452 } 453 454 /* 455 * This inode is the newest generation. Load it. 456 */ 457 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip), 458 lfs_dino_getgen(fs, dip), 459 l, &vp, dip); 460 if (error) { 461 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 462 " returned %d\n", error)); 463 continue; 464 } 465 ip = VTOI(vp); 466 if (lfs_dino_getsize(fs, dip) != ip->i_size 467 && vp->v_type != VLNK) { 468 /* XXX What should we do with symlinks? */ 469 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n", 470 (intmax_t)lfs_dino_getinumber(fs, dip), 471 (intmax_t)ip->i_size, 472 (intmax_t)lfs_dino_getsize(fs, dip))); 473 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 474 NOCRED); 475 } 476 update_inoblk_copy_dinode(fs, ip->i_din, dip); 477 478 ip->i_flags = lfs_dino_getflags(fs, dip); 479 ip->i_gen = lfs_dino_getgen(fs, dip); 480 ip->i_uid = lfs_dino_getuid(fs, dip); 481 ip->i_gid = lfs_dino_getgid(fs, dip); 482 483 ip->i_mode = lfs_dino_getmode(fs, dip); 484 ip->i_nlink = lfs_dino_getnlink(fs, dip); 485 ip->i_size = lfs_dino_getsize(fs, dip); 486 487 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 488 489 /* Re-initialize to get type right */ 490 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 491 &vp); 492 493 /* Record change in location */ 494 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 495 daddr = lfs_if_getdaddr(fs, ifp); 496 lfs_if_setdaddr(fs, ifp, offset); 497 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 498 /* And do segment accounting */ 499 osn = lfs_dtosn(fs, daddr); 500 nsn = lfs_dtosn(fs, offset); 501 if (DADDR_IS_BAD(daddr) || osn != nsn) { 502 if (!DADDR_IS_BAD(daddr)) { 503 LFS_SEGENTRY(sup, fs, osn, ibp); 504 sup->su_nbytes -= DINOSIZE(fs); 505 LFS_WRITESEGENTRY(sup, fs, osn, ibp); 506 } 507 LFS_SEGENTRY(sup, fs, nsn, ibp); 508 sup->su_nbytes += DINOSIZE(fs); 509 LFS_WRITESEGENTRY(sup, fs, nsn, ibp); 510 } 511 vput(vp); 512 } 513 free(buf, M_SEGMENT); 514 515 return 0; 516 } 517 518 /* 519 * Note the highest generation number of each inode in the Ifile. 520 * This allows us to skip processing data for intermediate versions. 521 */ 522 static int 523 update_inogen(struct lfs_inofuncarg *lifa) 524 { 525 struct lfs *fs; 526 daddr_t offset; 527 struct vnode *devvp; 528 union lfs_dinode *dip; 529 struct buf *dbp, *ibp; 530 int error; 531 IFILE *ifp; 532 unsigned i, num; 533 534 fs = lifa->fs; 535 offset = lifa->offset; 536 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 537 538 /* Read inode block */ 539 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 540 0, &dbp); 541 if (error) { 542 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 543 return error; 544 } 545 546 /* Check each inode against ifile entry */ 547 num = LFS_INOPB(fs); 548 for (i = num; i-- > 0; ) { 549 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 550 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM) 551 continue; 552 553 /* Update generation number */ 554 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 555 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip)) 556 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip)); 557 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 558 if (error) 559 break; 560 } 561 brelse(dbp, 0); 562 563 return error; 564 } 565 566 static int 567 finfo_func_rfw(struct lfs_finfofuncarg *lffa) 568 { 569 struct lfs *fs; 570 FINFO *fip; 571 daddr_t *offsetp; 572 struct lwp *l; 573 int j; 574 size_t size; 575 576 fs = lffa->fs; 577 fip = lffa->finfop; 578 offsetp = lffa->offsetp; 579 l = lffa->l; 580 size = lfs_sb_getbsize(fs); 581 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 582 if (j == lfs_fi_getnblocks(fs, fip) - 1) 583 size = lfs_fi_getlastlength(fs, fip); 584 585 /* Account for and update any direct blocks */ 586 if (lfs_fi_getino(fs, fip) > LFS_IFILE_INUM && 587 lfs_fi_getblock(fs, fip, j) >= 0) { 588 update_meta(fs, lfs_fi_getino(fs, fip), 589 lfs_fi_getversion(fs, fip), 590 lfs_fi_getblock(fs, fip, j), 591 *offsetp, size, l); 592 ++rblkcnt; 593 } 594 *offsetp += lfs_btofsb(fs, size); 595 } 596 597 return 0; 598 } 599 600 static int 601 skip_superblock(struct lfs *fs, daddr_t *offsetp) 602 { 603 daddr_t offset; 604 int i; 605 606 /* 607 * If this is segment 0, skip the label. 608 * If the segment has a superblock and we're at the top 609 * of the segment, skip the superblock. 610 */ 611 offset = *offsetp; 612 if (offset == lfs_sb_gets0addr(fs)) { 613 offset += lfs_btofsb(fs, LFS_LABELPAD); 614 } 615 for (i = 0; i < LFS_MAXNUMSB; i++) { 616 if (offset == lfs_sb_getsboff(fs, i)) { 617 offset += lfs_btofsb(fs, LFS_SBPAD); 618 break; 619 } 620 } 621 *offsetp = offset; 622 return 0; 623 } 624 625 /* 626 * Read the partial sement at offset. 627 * 628 * If finfo_func and ino_func are both NULL, check the summary 629 * and data checksums. During roll forward, this must be done in its 630 * entirety before processing any blocks. 631 * 632 * If finfo_func is given, use that to process every file block 633 * in the segment summary. If ino_func is given, use that to process 634 * every inode block. 635 */ 636 #define CKSEG_NONE 0x0000 637 #define CKSEG_CKSUM 0x0001 638 #define CKSEG_AVAIL 0x0002 639 640 int 641 lfs_parse_pseg(struct lfs *fs, daddr_t *offsetp, u_int64_t nextserial, 642 kauth_cred_t cred, int *pseg_flags, struct lwp *l, 643 int (*ino_func)(struct lfs_inofuncarg *), 644 int (*finfo_func)(struct lfs_finfofuncarg *), 645 int flags, void *arg) 646 { 647 struct vnode *devvp; 648 struct buf *bp, *dbp; 649 int error, ninos, i, j; 650 SEGSUM *ssp; 651 daddr_t offset, prevoffset; 652 IINFO *iip; 653 FINFO *fip; 654 size_t size; 655 uint32_t datasum, foundsum; 656 char *buf; 657 struct lfs_inofuncarg lifa; 658 struct lfs_finfofuncarg lffa; 659 660 KASSERT(fs != NULL); 661 KASSERT(offsetp != NULL); 662 663 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 664 665 /* Set up callback arguments */ 666 lifa.fs = fs; 667 /* lifa.offset = offset; */ 668 lifa.cred = cred; 669 lifa.l = l; 670 lifa.buf = malloc(lfs_sb_getbsize(fs), M_SEGMENT, M_WAITOK); 671 672 lifa.arg = arg; 673 674 lffa.fs = fs; 675 /* lffa.offsetp = offsetp; */ 676 /* lffa.finfop = finfop; */ 677 lffa.cred = cred; 678 lffa.l = l; 679 lffa.arg = arg; 680 681 prevoffset = *offsetp; 682 skip_superblock(fs, offsetp); 683 offset = *offsetp; 684 685 /* Read in the segment summary */ 686 buf = malloc(lfs_sb_getsumsize(fs), M_SEGMENT, M_WAITOK); 687 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 688 0, &bp); 689 if (error) 690 goto err; 691 memcpy(buf, bp->b_data, bp->b_bcount); 692 brelse(bp, BC_AGE); 693 694 ssp = (SEGSUM *)buf; 695 696 /* 697 * Phase I: Check summary checksum. 698 */ 699 if (flags & CKSEG_CKSUM) { 700 size_t sumstart; 701 702 if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) { 703 DLOG((DLOG_RF, "Bad magic at 0x%" PRIx64 "\n", 704 offset)); 705 offset = -1; 706 goto err; 707 } 708 709 sumstart = lfs_ss_getsumstart(fs); 710 if (lfs_ss_getsumsum(fs, ssp) != 711 cksum((char *)ssp + sumstart, 712 lfs_sb_getsumsize(fs) - sumstart)) { 713 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", 714 offset)); 715 offset = -1; 716 goto err; 717 } 718 #if 0 719 /* 720 * Under normal conditions, we should never be producing 721 * a partial segment with neither inode blocks nor data blocks. 722 * However, these do sometimes appear and they need not 723 * prevent us from continuing. 724 */ 725 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 726 lfs_ss_getninos(fs, ssp) == 0) { 727 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", 728 offset)); 729 offset = -1; 730 goto err; 731 } 732 #endif /* 0 */ 733 if (lfs_sb_getversion(fs) == 1) { 734 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 735 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 736 offset = -1; 737 goto err; 738 } 739 } else { 740 if (nextserial > 0 741 && lfs_ss_getserial(fs, ssp) != nextserial) { 742 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx," 743 " expected 0x%jx\n", (intmax_t)offset, 744 (intmax_t)lfs_ss_getserial(fs, ssp), 745 (intmax_t)nextserial)); 746 offset = -1; 747 goto err; 748 } 749 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 750 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 751 PRIx64 "\n", lfs_ss_getident(fs, ssp), 752 lfs_sb_getident(fs), offset)); 753 offset = -1; 754 goto err; 755 } 756 } 757 } 758 if (pseg_flags) 759 *pseg_flags = lfs_ss_getflags(fs, ssp); 760 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 761 762 /* Handle individual blocks */ 763 foundsum = 0; 764 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 765 iip = SEGSUM_IINFOSTART(fs, buf); 766 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf); 767 KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getssize(fs) / lfs_sb_getfsize(fs)); 768 KASSERT(lfs_ss_getnfinfo(fs, ssp) <= lfs_sb_getfsize(fs) / sizeof(FINFO32)); 769 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 770 /* Inode block? */ 771 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 772 if (flags & CKSEG_CKSUM) { 773 /* Read in the head and add to the buffer */ 774 error = bread(devvp, LFS_FSBTODB(fs, offset), 775 lfs_sb_getbsize(fs), 0, &dbp); 776 if (error) { 777 offset = -1; 778 goto err; 779 } 780 foundsum = lfs_cksum_part(dbp->b_data, 781 sizeof(uint32_t), foundsum); 782 brelse(dbp, BC_AGE); 783 } else if (ino_func != NULL) { 784 lifa.offset = offset; 785 error = (*ino_func)(&lifa); 786 if (error != 0) { 787 offset = -1; 788 goto err; 789 } 790 } 791 792 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 793 iip = NEXTLOWER_IINFO(fs, iip); 794 --ninos; 795 --i; /* compensate for ++i in loop header */ 796 continue; 797 } 798 799 /* File block */ 800 size = lfs_sb_getbsize(fs); 801 if (flags & CKSEG_CKSUM) { 802 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 803 if (j == lfs_fi_getnblocks(fs, fip) - 1) 804 size = lfs_fi_getlastlength(fs, fip); 805 error = bread(devvp, LFS_FSBTODB(fs, offset), 806 size, 0, &dbp); 807 if (error) { 808 offset = -1; 809 goto err; 810 } 811 foundsum = lfs_cksum_part(dbp->b_data, 812 sizeof(uint32_t), foundsum); 813 brelse(dbp, BC_AGE); 814 offset += lfs_btofsb(fs, size); 815 } 816 } else if (finfo_func != NULL) { 817 lffa.offsetp = &offset; 818 lffa.finfop = fip; 819 (*finfo_func)(&lffa); 820 } else { 821 int n = lfs_fi_getnblocks(fs, fip); 822 size = lfs_fi_getlastlength(fs, fip); 823 offset += lfs_btofsb(fs, lfs_sb_getbsize(fs) * (n - 1) 824 + size); 825 } 826 fip = NEXT_FINFO(fs, fip); 827 } 828 829 /* Checksum the array, compare */ 830 if (flags & CKSEG_CKSUM) { 831 datasum = lfs_ss_getdatasum(fs, ssp); 832 foundsum = lfs_cksum_fold(foundsum); 833 if (datasum != foundsum) { 834 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 835 " (wanted %x got %x)\n", 836 offset, datasum, foundsum)); 837 offset = -1; 838 goto err; 839 } 840 } else { 841 /* Don't clog the buffer queue */ 842 mutex_enter(&lfs_lock); 843 if (locked_queue_count > LFS_MAX_BUFS || 844 locked_queue_bytes > LFS_MAX_BYTES) { 845 lfs_flush(fs, SEGM_CKP, 0); 846 } 847 mutex_exit(&lfs_lock); 848 } 849 850 /* 851 * If we're at the end of the segment, move to the next. 852 * A partial segment needs space for a segment header (1 fsb) 853 * and a full block ("frag" fsb). Thus, adding "frag" fsb should 854 * still be within the current segment (whereas frag + 1 might 855 * be at the start of the next segment). 856 * 857 * This needs to match the definition of LFS_PARTIAL_FITS 858 * in lfs_segment.c. 859 */ 860 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs)) 861 != lfs_dtosn(fs, offset)) { 862 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, 863 ssp))) { 864 offset = -1; 865 goto err; 866 } 867 offset = lfs_ss_getnext(fs, ssp); 868 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 869 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 870 } 871 if (flags & CKSEG_AVAIL) 872 lfs_sb_subavail(fs, offset - prevoffset); 873 874 err: 875 free(lifa.buf, M_SEGMENT); 876 free(buf, M_SEGMENT); 877 878 *offsetp = offset; 879 return 0; 880 } 881 882 /* 883 * Roll forward. 884 */ 885 void 886 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 887 { 888 int flags, dirty; 889 daddr_t startoffset, offset, nextoffset, endpseg; 890 u_int64_t nextserial, startserial, endserial; 891 int sn, curseg; 892 struct proc *p; 893 kauth_cred_t cred; 894 SEGUSE *sup; 895 struct buf *bp; 896 897 p = l ? l->l_proc : NULL; 898 cred = p ? p->p_cred : NOCRED; 899 900 /* 901 * We don't roll forward for v1 filesystems, because 902 * of the danger that the clock was turned back between the last 903 * checkpoint and crash. This would roll forward garbage. 904 * 905 * v2 filesystems don't have this problem because they use a 906 * monotonically increasing serial number instead of a timestamp. 907 */ 908 rblkcnt = 0; 909 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw 910 || lfs_sb_getversion(fs) <= 1 || p == NULL) 911 return; 912 913 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n", 914 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs))); 915 DEBUG_CHECK_FREELIST(fs); 916 917 /* 918 * Phase I: Find the address of the last good partial 919 * segment that was written after the checkpoint. Mark 920 * the segments in question dirty, so they won't be 921 * reallocated. 922 */ 923 endpseg = startoffset = offset = lfs_sb_getoffset(fs); 924 flags = 0x0; 925 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 926 PRIx64 "\n", offset)); 927 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 928 if (!(sup->su_flags & SEGUSE_DIRTY)) 929 lfs_sb_subnclean(fs, 1); 930 sup->su_flags |= SEGUSE_DIRTY; 931 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 932 933 startserial = lfs_sb_getserial(fs); 934 endserial = nextserial = startserial + 1; 935 nextoffset = offset; 936 while (1) { 937 nextoffset = offset; 938 lfs_parse_pseg(fs, &nextoffset, nextserial, 939 cred, &flags, l, NULL, NULL, CKSEG_CKSUM, NULL); 940 if (nextoffset == -1) 941 break; 942 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) { 943 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), 944 bp); 945 if (!(sup->su_flags & SEGUSE_DIRTY)) 946 lfs_sb_subnclean(fs, 1); 947 sup->su_flags |= SEGUSE_DIRTY; 948 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 949 } 950 951 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx" 952 " serial=0x%jx\n", (intmax_t)nextoffset, 953 (intmax_t)nextserial)); 954 if (flags & SS_DIROP) { 955 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 956 PRIx64 "\n", offset)); 957 if (!(flags & SS_CONT)) { 958 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 959 "at 0x%" PRIx64 "\n", offset)); 960 } 961 } 962 offset = nextoffset; 963 ++nextserial; 964 965 if (!(flags & SS_CONT)) { 966 endpseg = nextoffset; 967 endserial = nextserial; 968 } 969 if (lfs_rfw_max_psegs > 0 970 && nextserial > startserial + lfs_rfw_max_psegs) 971 break; 972 } 973 if (flags & SS_CONT) { 974 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 975 "dirops discarded (0x%jx < 0x%jx)\n", 976 endpseg, nextoffset)); 977 } 978 if (lfs_sb_getversion(fs) > 1) 979 lfs_sb_setserial(fs, endserial); 980 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 981 "endpseg=0x%" PRIx64 "\n", endpseg)); 982 offset = startoffset; 983 if (offset != endpseg) { 984 /* Don't overwrite what we're trying to preserve */ 985 lfs_sb_setoffset(fs, endpseg); 986 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg))); 987 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 988 sn = (sn + 1) % lfs_sb_getnseg(fs); 989 /* XXX could we just fail to roll forward? */ 990 if (sn == curseg) 991 panic("lfs_mountfs: no clean segments"); 992 LFS_SEGENTRY(sup, fs, sn, bp); 993 dirty = (sup->su_flags & SEGUSE_DIRTY); 994 brelse(bp, 0); 995 if (!dirty) 996 break; 997 } 998 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 999 /* Explicitly set this segment dirty */ 1000 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 1001 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 1002 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp); 1003 1004 /* 1005 * Phase II: Identify the highest generation of each 1006 * inode. We will ignore inodes and data blocks 1007 * belonging to old versions. 1008 */ 1009 offset = startoffset; 1010 nextserial = startserial + 1; 1011 DLOG((DLOG_RF, "LFS roll forward phase 2 beginning\n")); 1012 while (offset > 0 && offset != endpseg) { 1013 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1014 NULL, l, update_inogen, NULL, 1015 CKSEG_NONE, NULL); 1016 DEBUG_CHECK_FREELIST(fs); 1017 } 1018 1019 /* 1020 * Phase III: Update inodes. 1021 */ 1022 offset = startoffset; 1023 nextserial = startserial + 1; 1024 DLOG((DLOG_RF, "LFS roll forward phase 3 beginning\n")); 1025 while (offset > 0 && offset != endpseg) { 1026 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1027 NULL, l, update_inoblk, NULL, 1028 CKSEG_NONE, NULL); 1029 DEBUG_CHECK_FREELIST(fs); 1030 } 1031 1032 /* 1033 * Phase IV: Roll forward, updating data blocks. 1034 */ 1035 offset = startoffset; 1036 nextserial = startserial + 1; 1037 DLOG((DLOG_RF, "LFS roll forward phase 4 beginning\n")); 1038 while (offset > 0 && offset != endpseg) { 1039 lfs_parse_pseg(fs, &offset, nextserial++, cred, 1040 NULL, l, NULL, finfo_func_rfw, 1041 CKSEG_AVAIL, NULL); 1042 DEBUG_CHECK_FREELIST(fs); 1043 } 1044 1045 /* 1046 * Finish: flush our changes to disk. 1047 */ 1048 lfs_sb_setserial(fs, endserial); 1049 1050 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 1051 DLOG((DLOG_RF, "lfs_mountfs: roll forward " 1052 "examined %jd blocks\n", 1053 (intmax_t)(endpseg - startoffset))); 1054 } 1055 1056 /* Get rid of our vnodes, except the ifile */ 1057 drop_vnode_pages(mp, l); 1058 DLOG((DLOG_RF, "LFS roll forward complete\n")); 1059 printf("%s: roll forward recovered %d data blocks\n", 1060 lfs_sb_getfsmnt(fs), rblkcnt); 1061 1062 /* 1063 * At this point we have no more changes to write to disk. 1064 * Reset the "avail" count to match the segments as they 1065 * appear on disk, and the clean segment count. 1066 */ 1067 lfs_reset_avail(fs); 1068 } 1069 1070 static bool 1071 all_selector(void *cl, struct vnode *vp) 1072 { 1073 return true; 1074 } 1075 1076 /* 1077 * Dump any pages from vnodes that may have been put on 1078 * during truncation. 1079 */ 1080 static void 1081 drop_vnode_pages(struct mount *mp, struct lwp *l) 1082 { 1083 struct vnode_iterator *marker; 1084 struct lfs *fs; 1085 struct vnode *vp; 1086 1087 fs = VFSTOULFS(mp)->um_lfs; 1088 vfs_vnode_iterator_init(mp, &marker); 1089 while ((vp = vfs_vnode_iterator_next(marker, 1090 all_selector, NULL)) != NULL) { 1091 if (vp == fs->lfs_ivnode) 1092 continue; 1093 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); 1094 uvm_vnp_setsize(vp, 0); 1095 uvm_vnp_setsize(vp, VTOI(vp)->i_size); 1096 VOP_UNLOCK(vp); 1097 vrele(vp); 1098 } 1099 vfs_vnode_iterator_destroy(marker); 1100 } 1101 1102 static int 1103 ino_func_setclean(struct lfs_inofuncarg *lifa) 1104 { 1105 struct lfs *fs; 1106 daddr_t offset; 1107 struct vnode *devvp, *vp; 1108 union lfs_dinode *dip; 1109 struct buf *dbp, *ibp; 1110 int error; 1111 IFILE *ifp; 1112 unsigned i, num; 1113 daddr_t true_addr; 1114 ino_t ino; 1115 1116 fs = lifa->fs; 1117 offset = lifa->offset; 1118 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 1119 1120 /* Read inode block */ 1121 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 1122 0, &dbp); 1123 if (error) { 1124 DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n", 1125 error)); 1126 return error; 1127 } 1128 memcpy(lifa->buf, dbp->b_data, dbp->b_bcount); 1129 brelse(dbp, BC_AGE); 1130 1131 /* Check each inode against ifile entry */ 1132 num = LFS_INOPB(fs); 1133 for (i = num; i-- > 0; ) { 1134 dip = DINO_IN_BLOCK(fs, lifa->buf, i); 1135 ino = lfs_dino_getinumber(fs, dip); 1136 if (ino == LFS_IFILE_INUM) { 1137 /* Check address against superblock */ 1138 true_addr = lfs_sb_getidaddr(fs); 1139 } else { 1140 /* Not ifile. Check address against ifile. */ 1141 LFS_IENTRY(ifp, fs, ino, ibp); 1142 true_addr = lfs_if_getdaddr(fs, ifp); 1143 brelse(ibp, 0); 1144 } 1145 if (offset != true_addr) 1146 continue; 1147 1148 /* XXX We can use fastvget here! */ 1149 1150 /* 1151 * An inode we need to relocate. 1152 * Get it if we can. 1153 */ 1154 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 1155 LK_EXCLUSIVE | LK_NOWAIT, &vp); 1156 if (error) 1157 continue; 1158 1159 KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip)); 1160 lfs_setclean(fs, vp); 1161 VOP_UNLOCK(vp); 1162 vrele(vp); 1163 1164 } 1165 1166 return error; 1167 } 1168 1169 static int 1170 ino_func_rewrite(struct lfs_inofuncarg *lifa) 1171 { 1172 struct lfs *fs; 1173 daddr_t offset; 1174 struct vnode *devvp, *vp; 1175 union lfs_dinode *dip; 1176 struct buf *dbp, *ibp; 1177 int error; 1178 IFILE *ifp; 1179 unsigned i, num; 1180 daddr_t true_addr; 1181 ino_t ino; 1182 1183 fs = lifa->fs; 1184 offset = lifa->offset; 1185 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 1186 1187 /* Read inode block */ 1188 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 1189 0, &dbp); 1190 if (error) { 1191 DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n", 1192 error)); 1193 return error; 1194 } 1195 memcpy(lifa->buf, dbp->b_data, dbp->b_bcount); 1196 brelse(dbp, BC_AGE); 1197 1198 /* Check each inode against ifile entry */ 1199 num = LFS_INOPB(fs); 1200 for (i = num; i-- > 0; ) { 1201 dip = DINO_IN_BLOCK(fs, lifa->buf, i); 1202 ino = lfs_dino_getinumber(fs, dip); 1203 if (ino == LFS_IFILE_INUM) { 1204 /* Check address against superblock */ 1205 true_addr = lfs_sb_getidaddr(fs); 1206 } else { 1207 /* Not ifile. Check address against ifile. */ 1208 LFS_IENTRY(ifp, fs, ino, ibp); 1209 true_addr = lfs_if_getdaddr(fs, ifp); 1210 brelse(ibp, 0); 1211 } 1212 if (offset != true_addr) 1213 continue; 1214 1215 if (ino == LFS_IFILE_INUM) 1216 continue; 1217 1218 /* XXX We can use fastvget here! */ 1219 1220 /* 1221 * An inode we need to relocate. 1222 * Get it if we can. 1223 */ 1224 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 1225 LK_EXCLUSIVE | LK_NOWAIT, &vp); 1226 if (error) 1227 continue; 1228 1229 KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip)); 1230 1231 if (!(VTOI(vp)->i_state & IN_CLEANING)) { 1232 lfs_setclean(fs, vp); 1233 lfs_writeinode(fs, fs->lfs_sp, VTOI(vp)); 1234 } 1235 1236 VOP_UNLOCK(vp); 1237 vrele(vp); 1238 1239 } 1240 1241 return error; 1242 } 1243 1244 static int 1245 rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop) 1246 { 1247 daddr_t daddr; 1248 int error; 1249 struct buf *bp; 1250 struct inode *ip; 1251 1252 KASSERT(have_finfop != NULL); 1253 1254 /* Look up current location of this block. */ 1255 error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL); 1256 if (error) 1257 return error; 1258 1259 /* Skip any block that is not here. */ 1260 if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset) 1261 return ESTALE; 1262 1263 /* 1264 * It is (was recently) here. Read the block. 1265 */ 1266 //size = lfs_blksize(fs, VTOI(vp), lbn); 1267 error = bread(vp, lbn, size, 0, &bp); 1268 if (error) 1269 return error; 1270 1271 if (vp == fs->lfs_ivnode) { 1272 VOP_BWRITE(vp, bp); 1273 } else { 1274 /* Get ready to write. */ 1275 if (!*have_finfop) { 1276 ip = VTOI(vp); 1277 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); 1278 fs->lfs_sp->vp = vp; 1279 *have_finfop = 1; 1280 } 1281 1282 KASSERT(bp->b_vp == vp); 1283 /* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */ 1284 lfs_bwrite_ext(bp, BW_CLEAN); 1285 KASSERT(bp->b_vp == vp); 1286 mutex_enter(&bufcache_lock); 1287 while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) { 1288 KASSERT(bp->b_vp != NULL); 1289 } 1290 mutex_exit(&bufcache_lock); 1291 1292 KASSERT(bp->b_flags & B_GATHERED); 1293 KASSERT(fs->lfs_sp->cbpp[-1] == bp); 1294 } 1295 return 0; 1296 } 1297 1298 static int 1299 finfo_func_rewrite(struct lfs_finfofuncarg *lffa) 1300 { 1301 struct lfs *fs; 1302 FINFO *fip; 1303 daddr_t *offsetp; 1304 int j, have_finfo, error; 1305 size_t size, bytes; 1306 ino_t ino; 1307 uint32_t gen; 1308 struct vnode *vp; 1309 daddr_t lbn; 1310 int *fragsp; 1311 1312 fs = lffa->fs; 1313 fip = lffa->finfop; 1314 offsetp = lffa->offsetp; 1315 fragsp = (int *)lffa->arg; 1316 1317 /* Get the inode and check its version. */ 1318 ino = lfs_fi_getino(fs, fip); 1319 gen = lfs_fi_getversion(fs, fip); 1320 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE|LK_NOWAIT, 1321 &vp); 1322 1323 /* 1324 * If we can't, or if version is wrong, or it has dirop blocks on it, 1325 * we can't relocate its blocks; but we still have to count 1326 * blocks through the partial segment to return the right offset. 1327 * XXX actually we can move DIROP vnodes' *old* data, as long 1328 * XXX as we are sure that we are moving *only* the old data---? 1329 */ 1330 if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) { 1331 if (error == 0) 1332 error = ESTALE; 1333 1334 if (vp != NULL) { 1335 VOP_UNLOCK(vp); 1336 vrele(vp); 1337 vp = NULL; 1338 } 1339 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs)) 1340 + lfs_fi_getlastlength(fs, fip); 1341 *offsetp += lfs_btofsb(fs, bytes); 1342 1343 return error; 1344 } 1345 1346 /* 1347 * We have the vnode and its version is correct. 1348 * Take a cleaning reference; and loop through the blocks 1349 * and rewrite them. 1350 */ 1351 lfs_setclean(fs, vp); 1352 size = lfs_sb_getbsize(fs); 1353 have_finfo = 0; 1354 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 1355 if (j == lfs_fi_getnblocks(fs, fip) - 1) 1356 size = lfs_fi_getlastlength(fs, fip); 1357 /* 1358 * An error of ESTALE indicates that there was nothing 1359 * to rewrite; this is not a problem. Any other error 1360 * causes us to skip the rest of this FINFO. 1361 */ 1362 if (vp != NULL && error == 0) { 1363 lbn = lfs_fi_getblock(fs, fip, j); 1364 error = rewrite_block(fs, vp, lbn, *offsetp, 1365 size, &have_finfo); 1366 if (error == ESTALE) 1367 error = 0; 1368 if (fragsp != NULL && error == 0) 1369 *fragsp += lfs_btofsb(fs, size); 1370 } 1371 *offsetp += lfs_btofsb(fs, size); 1372 } 1373 1374 /* 1375 * If we acquired finfo, release it and write the blocks. 1376 */ 1377 if (have_finfo) { 1378 lfs_updatemeta(fs->lfs_sp); 1379 fs->lfs_sp->vp = NULL; 1380 lfs_release_finfo(fs); 1381 lfs_writeinode(fs, fs->lfs_sp, VTOI(vp)); 1382 } 1383 1384 /* Release vnode */ 1385 VOP_UNLOCK(vp); 1386 vrele(vp); 1387 1388 return error; 1389 } 1390 1391 static int 1392 finfo_func_setclean(struct lfs_finfofuncarg *lffa) 1393 { 1394 struct lfs *fs; 1395 FINFO *fip; 1396 daddr_t *offsetp; 1397 int error; 1398 size_t bytes; 1399 ino_t ino; 1400 uint32_t gen; 1401 struct vnode *vp; 1402 1403 fs = lffa->fs; 1404 fip = lffa->finfop; 1405 offsetp = lffa->offsetp; 1406 1407 /* Get the inode and check its version. */ 1408 ino = lfs_fi_getino(fs, fip); 1409 gen = lfs_fi_getversion(fs, fip); 1410 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE|LK_NOWAIT, 1411 &vp); 1412 1413 /* If we have it and its version is right, take a cleaning reference */ 1414 if (error == 0 && VTOI(vp)->i_gen == gen) 1415 lfs_setclean(fs, vp); 1416 1417 if (vp != NULL) { 1418 VOP_UNLOCK(vp); 1419 vrele(vp); 1420 vp = NULL; 1421 } 1422 1423 /* Skip to the next block */ 1424 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs)) 1425 + lfs_fi_getlastlength(fs, fip); 1426 *offsetp += lfs_btofsb(fs, bytes); 1427 1428 return error; 1429 } 1430 1431 /* 1432 * Use the partial-segment parser to rewrite (clean) a segment. 1433 */ 1434 int 1435 lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l) 1436 { 1437 daddr_t ooffset, offset, endpseg; 1438 1439 ASSERT_SEGLOCK(fs); 1440 1441 offset = lfs_sntod(fs, sn); 1442 skip_superblock(fs, &offset); 1443 endpseg = lfs_sntod(fs, sn + 1); 1444 1445 while (offset > 0 && offset != endpseg) { 1446 /* First check summary validity (XXX unnecessary?) */ 1447 ooffset = offset; 1448 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 1449 NULL, NULL, CKSEG_CKSUM, NULL); 1450 if (offset == ooffset) 1451 break; 1452 1453 /* 1454 * Valid, proceed. 1455 * 1456 * First write the file blocks, marking their 1457 * inodes IN_CLEANING. 1458 */ 1459 offset = ooffset; 1460 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 1461 NULL, finfo_func_rewrite, 1462 CKSEG_NONE, fragsp); 1463 1464 /* 1465 * Now go back and pick up any inodes that 1466 * were not already marked IN_CLEANING, and 1467 * write them as well. 1468 */ 1469 offset = ooffset; 1470 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 1471 ino_func_rewrite, NULL, 1472 CKSEG_NONE, fragsp); 1473 } 1474 return 0; 1475 } 1476 1477 /* 1478 * Rewrite the contents of one or more segments, in preparation for 1479 * marking them clean. 1480 */ 1481 int 1482 lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l) 1483 { 1484 kauth_cred_t cred; 1485 int i, error; 1486 struct buf *bp; 1487 SEGUSE *sup; 1488 daddr_t offset, endpseg; 1489 1490 ASSERT_NO_SEGLOCK(fs); 1491 1492 cred = l ? l->l_cred : NOCRED; 1493 1494 /* Prevent new dirops and acquire the cleaner lock. */ 1495 lfs_writer_enter(fs, "rewritesegs"); 1496 if ((error = lfs_cleanerlock(fs)) != 0) { 1497 lfs_writer_leave(fs); 1498 return error; 1499 } 1500 1501 /* 1502 * Pre-reference vnodes now that we have cleaner lock 1503 * but before we take the segment lock. We don't want to 1504 * mix cleaning blocks with flushed vnodes. 1505 */ 1506 for (i = 0; i < len; i++) { 1507 error = 0; 1508 /* Refuse to clean segments that are ACTIVE */ 1509 LFS_SEGENTRY(sup, fs, snn[i], bp); 1510 if (sup->su_flags & SEGUSE_ACTIVE 1511 || !(sup->su_flags & SEGUSE_DIRTY)) 1512 error = EINVAL; 1513 1514 brelse(bp, 0); 1515 if (error) 1516 break; 1517 1518 offset = lfs_sntod(fs, snn[i]); 1519 skip_superblock(fs, &offset); 1520 endpseg = lfs_sntod(fs, snn[i] + 1); 1521 1522 while (offset > 0 && offset != endpseg) { 1523 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 1524 ino_func_setclean, finfo_func_setclean, 1525 CKSEG_NONE, NULL); 1526 } 1527 } 1528 1529 /* 1530 * Actually rewrite the contents of the segment. 1531 */ 1532 lfs_seglock(fs, SEGM_CLEAN); 1533 1534 for (i = 0; i < len; i++) { 1535 error = 0; 1536 /* Refuse to clean segments that are ACTIVE */ 1537 LFS_SEGENTRY(sup, fs, snn[i], bp); 1538 if (sup->su_flags & SEGUSE_ACTIVE 1539 || !(sup->su_flags & SEGUSE_DIRTY)) 1540 error = EINVAL; 1541 1542 brelse(bp, 0); 1543 if (error) 1544 break; 1545 1546 error = lfs_rewrite_segment(fs, snn[i], directp, cred, l); 1547 if (error) { 1548 printf(" rewrite_segment returned %d\n", error); 1549 break; 1550 } 1551 } 1552 while (lfs_writeseg(fs, fs->lfs_sp)) 1553 ; 1554 1555 *offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written); 1556 lfs_segunlock(fs); 1557 lfs_cleanerunlock(fs); 1558 lfs_writer_leave(fs); 1559 1560 return error; 1561 } 1562 1563 #if 0 1564 static bool 1565 lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2) 1566 { 1567 return lbn2 == lbn1 + lfs_sb_getfrag(__UNCONST(fs)); 1568 } 1569 1570 /* 1571 * Rewrite the contents of a file in order to coalesce it. 1572 * We don't bother rewriting indirect blocks because they will have to 1573 * be rewritten anyway when we rewrite the direct blocks. 1574 */ 1575 int 1576 lfs_rewrite_file(struct lfs *fs, ino_t ino, struct lwp *l) 1577 { 1578 daddr_t lbn, hiblk, daddr; 1579 int i, error, num, run; 1580 struct vnode *vp; 1581 struct indir indirs[ULFS_NIADDR+2]; 1582 size_t size; 1583 1584 ASSERT_SEGLOCK(fs); 1585 1586 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 1587 if (error) 1588 return error; 1589 1590 lfs_acquire_finfo(fs, ino, VTOI(vp)->i_gen); 1591 for (lbn = 0, hiblk = VTOI(vp)->i_lfs_hiblk; lbn < hiblk; ++lbn) { 1592 error = ulfs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, &run, 1593 lfs_isseq); 1594 if (daddr == UNASSIGNED) 1595 continue; 1596 for (i = 0; i <= run; i++) { 1597 size = lfs_blksize(fs, VTOI(vp), lbn); 1598 error = rewrite_block(fs, vp, lbn++, 0x0, size, NULL); 1599 if (error) 1600 break; 1601 } 1602 } 1603 lfs_release_finfo(fs); 1604 while (lfs_writeseg(fs, fs->lfs_sp)) 1605 ; 1606 lfs_segunlock(fs); 1607 1608 return error; 1609 } 1610 #endif /* 0 */ 1611 1612 1613 static int 1614 ino_func_checkempty(struct lfs_inofuncarg *lifa) 1615 { 1616 struct lfs *fs; 1617 daddr_t offset; 1618 struct vnode *devvp; 1619 union lfs_dinode *dip; 1620 struct buf *dbp, *ibp; 1621 int error; 1622 IFILE *ifp; 1623 unsigned i, num; 1624 daddr_t true_addr; 1625 ino_t ino; 1626 1627 fs = lifa->fs; 1628 offset = lifa->offset; 1629 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 1630 1631 /* Read inode block */ 1632 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 1633 0, &dbp); 1634 if (error) { 1635 DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n", 1636 error)); 1637 return error; 1638 } 1639 1640 /* Check each inode against ifile entry */ 1641 num = LFS_INOPB(fs); 1642 for (i = num; i-- > 0; ) { 1643 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 1644 ino = lfs_dino_getinumber(fs, dip); 1645 if (ino == LFS_IFILE_INUM) { 1646 /* Check address against superblock */ 1647 true_addr = lfs_sb_getidaddr(fs); 1648 } else { 1649 /* Not ifile. Check address against ifile. */ 1650 LFS_IENTRY(ifp, fs, ino, ibp); 1651 true_addr = lfs_if_getdaddr(fs, ifp); 1652 brelse(ibp, 0); 1653 } 1654 if (offset == true_addr) { 1655 error = EEXIST; 1656 break; 1657 } 1658 } 1659 brelse(dbp, BC_AGE); 1660 1661 return error; 1662 } 1663 1664 static int 1665 finfo_func_checkempty(struct lfs_finfofuncarg *lffa) 1666 { 1667 struct lfs *fs; 1668 FINFO *fip; 1669 daddr_t *offsetp; 1670 int j, error; 1671 size_t size, bytes; 1672 ino_t ino; 1673 uint32_t gen; 1674 struct vnode *vp; 1675 daddr_t lbn, daddr; 1676 1677 fs = lffa->fs; 1678 fip = lffa->finfop; 1679 offsetp = lffa->offsetp; 1680 1681 /* Get the inode and check its version. */ 1682 ino = lfs_fi_getino(fs, fip); 1683 gen = lfs_fi_getversion(fs, fip); 1684 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 1685 1686 /* 1687 * If we can't, or if version is wrong, this FINFO does not refer 1688 * to a live file. Skip over it and continue. 1689 */ 1690 if (error || VTOI(vp)->i_gen != gen) { 1691 if (error == 0) 1692 error = ESTALE; 1693 1694 if (vp != NULL) { 1695 VOP_UNLOCK(vp); 1696 vrele(vp); 1697 vp = NULL; 1698 } 1699 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) 1700 << lfs_sb_getbshift(fs)) 1701 + lfs_fi_getlastlength(fs, fip); 1702 *offsetp += lfs_btofsb(fs, bytes); 1703 1704 return error; 1705 } 1706 1707 /* 1708 * We have the vnode and its version is correct. 1709 * Loop through the blocks and check their currency. 1710 */ 1711 size = lfs_sb_getbsize(fs); 1712 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 1713 if (j == lfs_fi_getnblocks(fs, fip) - 1) 1714 size = lfs_fi_getlastlength(fs, fip); 1715 if (vp != NULL) { 1716 lbn = lfs_fi_getblock(fs, fip, j); 1717 1718 /* Look up current location of this block. */ 1719 error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL); 1720 if (error) 1721 break; 1722 1723 /* If it is here, the segment is not empty. */ 1724 if (LFS_DBTOFSB(fs, daddr) == *offsetp) { 1725 error = EEXIST; 1726 break; 1727 } 1728 } 1729 *offsetp += lfs_btofsb(fs, size); 1730 } 1731 1732 /* Release vnode */ 1733 VOP_UNLOCK(vp); 1734 vrele(vp); 1735 1736 return error; 1737 } 1738 1739 int 1740 lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l) 1741 { 1742 daddr_t offset, endpseg; 1743 int error; 1744 1745 ASSERT_SEGLOCK(fs); 1746 1747 offset = lfs_sntod(fs, sn); 1748 skip_superblock(fs, &offset); 1749 endpseg = lfs_sntod(fs, sn + 1); 1750 1751 while (offset > 0 && offset < endpseg) { 1752 error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 1753 ino_func_checkempty, 1754 finfo_func_checkempty, 1755 CKSEG_NONE, NULL); 1756 if (error) 1757 return error; 1758 } 1759 return 0; 1760 } 1761