1 /* $NetBSD: lfs_kclean.c,v 1.1 2025/11/06 15:54:27 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 2025 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_kclean.c,v 1.1 2025/11/06 15:54:27 perseant Exp $"); 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/namei.h> 38 #include <sys/proc.h> 39 #include <sys/kernel.h> 40 #include <sys/vnode.h> 41 #include <sys/conf.h> 42 #include <sys/kauth.h> 43 #include <sys/buf.h> 44 #include <sys/kthread.h> 45 46 #include <ufs/lfs/ulfs_inode.h> 47 #include <ufs/lfs/ulfsmount.h> 48 #include <ufs/lfs/ulfs_extern.h> 49 50 #include <ufs/lfs/lfs.h> 51 #include <ufs/lfs/lfs_accessors.h> 52 #include <ufs/lfs/lfs_kernel.h> 53 #include <ufs/lfs/lfs_extern.h> 54 55 static int ino_func_setclean(struct lfs_inofuncarg *); 56 static int finfo_func_rewrite(struct lfs_finfofuncarg *); 57 static int finfo_func_setclean(struct lfs_finfofuncarg *); 58 static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t, 59 size_t, int *); 60 61 static int clean(struct lfs *); 62 static long segselect_cb_rosenblum(struct lfs *, int, SEGUSE *, long); 63 static long segselect_greedy(struct lfs *, int, SEGUSE *); 64 static long segselect_cb_time(struct lfs *, int, SEGUSE *); 65 #if 0 66 static long segselect_cb_serial(struct lfs *, int, SEGUSE *); 67 #endif 68 69 struct lwp * lfs_cleaner_daemon = NULL; 70 extern kcondvar_t lfs_allclean_wakeup; 71 static int lfs_ncleaners = 0; 72 73 static int 74 ino_func_setclean(struct lfs_inofuncarg *lifa) 75 { 76 struct lfs *fs; 77 daddr_t offset; 78 struct vnode *devvp, *vp; 79 union lfs_dinode *dip; 80 struct buf *dbp, *ibp; 81 int error; 82 IFILE *ifp; 83 unsigned i, num; 84 daddr_t true_addr; 85 ino_t ino; 86 87 fs = lifa->fs; 88 offset = lifa->offset; 89 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 90 91 /* Read inode block */ 92 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 93 0, &dbp); 94 if (error) { 95 DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n", 96 error)); 97 return error; 98 } 99 memcpy(lifa->buf, dbp->b_data, dbp->b_bcount); 100 brelse(dbp, BC_AGE); 101 102 /* Check each inode against ifile entry */ 103 num = LFS_INOPB(fs); 104 for (i = num; i-- > 0; ) { 105 dip = DINO_IN_BLOCK(fs, lifa->buf, i); 106 ino = lfs_dino_getinumber(fs, dip); 107 if (ino == LFS_IFILE_INUM) { 108 /* Check address against superblock */ 109 true_addr = lfs_sb_getidaddr(fs); 110 } else { 111 /* Not ifile. Check address against ifile. */ 112 LFS_IENTRY(ifp, fs, ino, ibp); 113 true_addr = lfs_if_getdaddr(fs, ifp); 114 brelse(ibp, 0); 115 } 116 if (offset != true_addr) 117 continue; 118 119 LFS_ASSERT_MAXINO(fs, ino); 120 121 /* XXX We can use fastvget here! */ 122 123 /* 124 * An inode we need to relocate. 125 * Get it if we can. 126 */ 127 if (ino == LFS_IFILE_INUM) 128 vp = fs->lfs_ivnode; 129 else 130 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 131 LK_EXCLUSIVE | LK_NOWAIT, &vp); 132 if (error) 133 continue; 134 135 KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip)); 136 lfs_setclean(fs, vp); 137 if (vp != fs->lfs_ivnode) { 138 VOP_UNLOCK(vp); 139 vrele(vp); 140 } 141 } 142 143 return error; 144 } 145 146 static int 147 ino_func_rewrite(struct lfs_inofuncarg *lifa) 148 { 149 struct lfs *fs; 150 daddr_t offset; 151 struct vnode *devvp, *vp; 152 union lfs_dinode *dip; 153 struct buf *dbp, *ibp; 154 int error; 155 IFILE *ifp; 156 unsigned i, num; 157 daddr_t true_addr; 158 ino_t ino; 159 160 fs = lifa->fs; 161 offset = lifa->offset; 162 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 163 164 /* Read inode block */ 165 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 166 0, &dbp); 167 if (error) { 168 DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n", 169 error)); 170 return error; 171 } 172 memcpy(lifa->buf, dbp->b_data, dbp->b_bcount); 173 brelse(dbp, BC_AGE); 174 175 /* Check each inode against ifile entry */ 176 num = LFS_INOPB(fs); 177 for (i = num; i-- > 0; ) { 178 dip = DINO_IN_BLOCK(fs, lifa->buf, i); 179 ino = lfs_dino_getinumber(fs, dip); 180 if (ino == LFS_IFILE_INUM) { 181 /* Check address against superblock */ 182 true_addr = lfs_sb_getidaddr(fs); 183 } else { 184 /* Not ifile. Check address against ifile. */ 185 LFS_IENTRY(ifp, fs, ino, ibp); 186 true_addr = lfs_if_getdaddr(fs, ifp); 187 brelse(ibp, 0); 188 } 189 if (offset != true_addr) 190 continue; 191 192 if (ino == LFS_IFILE_INUM) 193 continue; 194 195 LFS_ASSERT_MAXINO(fs, ino); 196 197 /* XXX We can use fastvget here! */ 198 199 /* 200 * An inode we need to relocate. 201 * Get it if we can. 202 */ 203 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 204 LK_EXCLUSIVE | LK_NOWAIT, &vp); 205 if (error) 206 continue; 207 208 KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip)); 209 210 if (!(VTOI(vp)->i_state & IN_CLEANING)) { 211 lfs_setclean(fs, vp); 212 lfs_writeinode(fs, fs->lfs_sp, VTOI(vp)); 213 } 214 215 VOP_UNLOCK(vp); 216 vrele(vp); 217 218 } 219 220 return error; 221 } 222 223 static int 224 rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop) 225 { 226 daddr_t daddr; 227 int error; 228 struct buf *bp; 229 struct inode *ip; 230 231 KASSERT(have_finfop != NULL); 232 233 /* Look up current location of this block. */ 234 error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL); 235 if (error) 236 return error; 237 238 /* Skip any block that is not here. */ 239 if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset) 240 return ESTALE; 241 242 /* 243 * It is (was recently) here. Read the block. 244 */ 245 //size = lfs_blksize(fs, VTOI(vp), lbn); 246 error = bread(vp, lbn, size, 0, &bp); 247 if (error) 248 return error; 249 250 if (vp == fs->lfs_ivnode) { 251 VOP_BWRITE(vp, bp); 252 } else { 253 /* Get ready to write. */ 254 if (!*have_finfop) { 255 ip = VTOI(vp); 256 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); 257 fs->lfs_sp->vp = vp; 258 *have_finfop = 1; 259 } 260 261 KASSERT(bp->b_vp == vp); 262 /* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */ 263 lfs_bwrite_ext(bp, BW_CLEAN); 264 KASSERT(bp->b_vp == vp); 265 mutex_enter(&bufcache_lock); 266 while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) { 267 KASSERT(bp->b_vp != NULL); 268 } 269 mutex_exit(&bufcache_lock); 270 271 KASSERT(bp->b_flags & B_GATHERED); 272 KASSERT(fs->lfs_sp->cbpp[-1] == bp); 273 } 274 return 0; 275 } 276 277 static int 278 finfo_func_rewrite(struct lfs_finfofuncarg *lffa) 279 { 280 struct lfs *fs; 281 FINFO *fip; 282 daddr_t *offsetp; 283 int j, have_finfo, error; 284 size_t size, bytes; 285 ino_t ino; 286 uint32_t gen; 287 struct vnode *vp; 288 daddr_t lbn; 289 int *fragsp; 290 291 fs = lffa->fs; 292 fip = lffa->finfop; 293 offsetp = lffa->offsetp; 294 fragsp = (int *)lffa->arg; 295 296 /* Get the inode and check its version. */ 297 ino = lfs_fi_getino(fs, fip); 298 gen = lfs_fi_getversion(fs, fip); 299 error = 0; 300 if (ino == LFS_IFILE_INUM) 301 vp = fs->lfs_ivnode; 302 else { 303 LFS_ASSERT_MAXINO(fs, ino); 304 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 305 LK_EXCLUSIVE|LK_NOWAIT, &vp); 306 } 307 308 /* 309 * If we can't, or if version is wrong, or it has dirop blocks on it, 310 * we can't relocate its blocks; but we still have to count 311 * blocks through the partial segment to return the right offset. 312 * XXX actually we can move DIROP vnodes' *old* data, as long 313 * XXX as we are sure that we are moving *only* the old data---? 314 */ 315 if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) { 316 if (error == 0) 317 error = ESTALE; 318 319 if (vp != NULL && vp != fs->lfs_ivnode) { 320 VOP_UNLOCK(vp); 321 vrele(vp); 322 } 323 vp = NULL; 324 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs)) 325 + lfs_fi_getlastlength(fs, fip); 326 *offsetp += lfs_btofsb(fs, bytes); 327 328 return error; 329 } 330 331 /* 332 * We have the vnode and its version is correct. 333 * Take a cleaning reference; and loop through the blocks 334 * and rewrite them. 335 */ 336 lfs_setclean(fs, vp); 337 size = lfs_sb_getbsize(fs); 338 have_finfo = 0; 339 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 340 if (j == lfs_fi_getnblocks(fs, fip) - 1) 341 size = lfs_fi_getlastlength(fs, fip); 342 /* 343 * An error of ESTALE indicates that there was nothing 344 * to rewrite; this is not a problem. Any other error 345 * causes us to skip the rest of this FINFO. 346 */ 347 if (vp != NULL && error == 0) { 348 lbn = lfs_fi_getblock(fs, fip, j); 349 error = rewrite_block(fs, vp, lbn, *offsetp, 350 size, &have_finfo); 351 if (error == ESTALE) 352 error = 0; 353 if (fragsp != NULL && error == 0) 354 *fragsp += lfs_btofsb(fs, size); 355 } 356 *offsetp += lfs_btofsb(fs, size); 357 } 358 359 /* 360 * If we acquired finfo, release it and write the blocks. 361 */ 362 if (have_finfo) { 363 lfs_updatemeta(fs->lfs_sp); 364 fs->lfs_sp->vp = NULL; 365 lfs_release_finfo(fs); 366 lfs_writeinode(fs, fs->lfs_sp, VTOI(vp)); 367 } 368 369 /* Release vnode */ 370 if (vp != fs->lfs_ivnode) { 371 VOP_UNLOCK(vp); 372 vrele(vp); 373 } 374 375 return error; 376 } 377 378 static int 379 finfo_func_setclean(struct lfs_finfofuncarg *lffa) 380 { 381 struct lfs *fs; 382 FINFO *fip; 383 daddr_t *offsetp; 384 int error; 385 size_t bytes; 386 ino_t ino; 387 uint32_t gen; 388 struct vnode *vp; 389 390 fs = lffa->fs; 391 fip = lffa->finfop; 392 offsetp = lffa->offsetp; 393 394 /* Get the inode and check its version. */ 395 ino = lfs_fi_getino(fs, fip); 396 gen = lfs_fi_getversion(fs, fip); 397 error = 0; 398 if (ino == LFS_IFILE_INUM) 399 vp = fs->lfs_ivnode; 400 else { 401 LFS_ASSERT_MAXINO(fs, ino); 402 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 403 LK_EXCLUSIVE|LK_NOWAIT, &vp); 404 } 405 406 /* If we have it and its version is right, take a cleaning reference */ 407 if (error == 0 && VTOI(vp)->i_gen == gen) 408 lfs_setclean(fs, vp); 409 410 if (vp == fs->lfs_ivnode) 411 vp = NULL; 412 else if (vp != NULL) { 413 VOP_UNLOCK(vp); 414 vrele(vp); 415 vp = NULL; 416 } 417 418 /* Skip to the next block */ 419 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs)) 420 + lfs_fi_getlastlength(fs, fip); 421 *offsetp += lfs_btofsb(fs, bytes); 422 423 return error; 424 } 425 426 /* 427 * Use the partial-segment parser to rewrite (clean) a segment. 428 */ 429 int 430 lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l) 431 { 432 daddr_t ooffset, offset, endpseg; 433 434 ASSERT_SEGLOCK(fs); 435 436 offset = lfs_sntod(fs, sn); 437 lfs_skip_superblock(fs, &offset); 438 endpseg = lfs_sntod(fs, sn + 1); 439 440 while (offset > 0 && offset != endpseg) { 441 /* First check summary validity (XXX unnecessary?) */ 442 ooffset = offset; 443 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 444 NULL, NULL, CKSEG_CKSUM, NULL); 445 if (offset == ooffset) 446 break; 447 448 /* 449 * Valid, proceed. 450 * 451 * First write the file blocks, marking their 452 * inodes IN_CLEANING. 453 */ 454 offset = ooffset; 455 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 456 NULL, finfo_func_rewrite, 457 CKSEG_NONE, fragsp); 458 459 /* 460 * Now go back and pick up any inodes that 461 * were not already marked IN_CLEANING, and 462 * write them as well. 463 */ 464 offset = ooffset; 465 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 466 ino_func_rewrite, NULL, 467 CKSEG_NONE, fragsp); 468 } 469 return 0; 470 } 471 472 /* 473 * Rewrite the contents of one or more segments, in preparation for 474 * marking them clean. 475 */ 476 int 477 lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l) 478 { 479 kauth_cred_t cred; 480 int i, error; 481 struct buf *bp; 482 SEGUSE *sup; 483 daddr_t offset, endpseg; 484 485 ASSERT_NO_SEGLOCK(fs); 486 487 cred = l ? l->l_cred : NOCRED; 488 489 /* Prevent new dirops and acquire the cleaner lock. */ 490 lfs_writer_enter(fs, "rewritesegs"); 491 if ((error = lfs_cleanerlock(fs)) != 0) { 492 lfs_writer_leave(fs); 493 return error; 494 } 495 496 /* 497 * Pre-reference vnodes now that we have cleaner lock 498 * but before we take the segment lock. We don't want to 499 * mix cleaning blocks with flushed vnodes. 500 */ 501 for (i = 0; i < len; i++) { 502 error = 0; 503 /* Refuse to clean segments that are ACTIVE */ 504 LFS_SEGENTRY(sup, fs, snn[i], bp); 505 if (sup->su_flags & SEGUSE_ACTIVE 506 || !(sup->su_flags & SEGUSE_DIRTY)) 507 error = EINVAL; 508 509 brelse(bp, 0); 510 if (error) 511 break; 512 513 offset = lfs_sntod(fs, snn[i]); 514 lfs_skip_superblock(fs, &offset); 515 endpseg = lfs_sntod(fs, snn[i] + 1); 516 517 while (offset > 0 && offset != endpseg) { 518 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 519 ino_func_setclean, finfo_func_setclean, 520 CKSEG_NONE, NULL); 521 } 522 } 523 524 /* 525 * Actually rewrite the contents of the segment. 526 */ 527 lfs_seglock(fs, SEGM_CLEAN); 528 529 for (i = 0; i < len; i++) { 530 error = 0; 531 /* Refuse to clean segments that are ACTIVE */ 532 LFS_SEGENTRY(sup, fs, snn[i], bp); 533 if (sup->su_flags & SEGUSE_ACTIVE 534 || !(sup->su_flags & SEGUSE_DIRTY)) 535 error = EINVAL; 536 537 brelse(bp, 0); 538 if (error) 539 break; 540 541 error = lfs_rewrite_segment(fs, snn[i], directp, cred, l); 542 if (error) { 543 printf(" rewrite_segment returned %d\n", error); 544 break; 545 } 546 } 547 while (lfs_writeseg(fs, fs->lfs_sp)) 548 ; 549 550 *offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written); 551 lfs_segunlock(fs); 552 lfs_cleanerunlock(fs); 553 lfs_writer_leave(fs); 554 555 return error; 556 } 557 558 #if 0 559 static bool 560 lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2) 561 { 562 return lbn2 == lbn1 + lfs_sb_getfrag(__UNCONST(fs)); 563 } 564 565 /* 566 * Rewrite the contents of a file in order to coalesce it. 567 * We don't bother rewriting indirect blocks because they will have to 568 * be rewritten anyway when we rewrite the direct blocks. 569 */ 570 int 571 lfs_rewrite_file(struct lfs *fs, ino_t ino, struct lwp *l) 572 { 573 daddr_t lbn, hiblk, daddr; 574 int i, error, num, run; 575 struct vnode *vp; 576 struct indir indirs[ULFS_NIADDR+2]; 577 size_t size; 578 579 ASSERT_SEGLOCK(fs); 580 581 LFS_ASSERT_MAXINO(fs, ino); 582 583 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 584 if (error) 585 return error; 586 587 lfs_acquire_finfo(fs, ino, VTOI(vp)->i_gen); 588 for (lbn = 0, hiblk = VTOI(vp)->i_lfs_hiblk; lbn < hiblk; ++lbn) { 589 error = ulfs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, &run, 590 lfs_isseq); 591 if (daddr == UNASSIGNED) 592 continue; 593 for (i = 0; i <= run; i++) { 594 size = lfs_blksize(fs, VTOI(vp), lbn); 595 error = rewrite_block(fs, vp, lbn++, 0x0, size, NULL); 596 if (error) 597 break; 598 } 599 } 600 lfs_release_finfo(fs); 601 while (lfs_writeseg(fs, fs->lfs_sp)) 602 ; 603 lfs_segunlock(fs); 604 605 return error; 606 } 607 #endif /* 0 */ 608 609 610 static int 611 ino_func_checkempty(struct lfs_inofuncarg *lifa) 612 { 613 struct lfs *fs; 614 daddr_t offset; 615 struct vnode *devvp; 616 union lfs_dinode *dip; 617 struct buf *dbp, *ibp; 618 int error; 619 IFILE *ifp; 620 unsigned i, num; 621 daddr_t true_addr; 622 ino_t ino; 623 624 fs = lifa->fs; 625 offset = lifa->offset; 626 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 627 628 /* Read inode block */ 629 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 630 0, &dbp); 631 if (error) { 632 DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n", 633 error)); 634 return error; 635 } 636 637 /* Check each inode against ifile entry */ 638 num = LFS_INOPB(fs); 639 for (i = num; i-- > 0; ) { 640 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 641 ino = lfs_dino_getinumber(fs, dip); 642 if (ino == LFS_IFILE_INUM) { 643 /* Check address against superblock */ 644 true_addr = lfs_sb_getidaddr(fs); 645 } else { 646 /* Not ifile. Check address against ifile. */ 647 LFS_IENTRY(ifp, fs, ino, ibp); 648 true_addr = lfs_if_getdaddr(fs, ifp); 649 brelse(ibp, 0); 650 } 651 if (offset == true_addr) { 652 error = EEXIST; 653 break; 654 } 655 } 656 brelse(dbp, BC_AGE); 657 658 return error; 659 } 660 661 static int 662 finfo_func_checkempty(struct lfs_finfofuncarg *lffa) 663 { 664 struct lfs *fs; 665 FINFO *fip; 666 daddr_t *offsetp; 667 int j, error; 668 size_t size, bytes; 669 ino_t ino; 670 uint32_t gen; 671 struct vnode *vp; 672 daddr_t lbn, daddr; 673 674 fs = lffa->fs; 675 fip = lffa->finfop; 676 offsetp = lffa->offsetp; 677 678 /* Get the inode and check its version. */ 679 ino = lfs_fi_getino(fs, fip); 680 gen = lfs_fi_getversion(fs, fip); 681 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp); 682 683 /* 684 * If we can't, or if version is wrong, this FINFO does not refer 685 * to a live file. Skip over it and continue. 686 */ 687 if (error || VTOI(vp)->i_gen != gen) { 688 if (error == 0) 689 error = ESTALE; 690 691 if (vp != NULL) { 692 VOP_UNLOCK(vp); 693 vrele(vp); 694 vp = NULL; 695 } 696 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) 697 << lfs_sb_getbshift(fs)) 698 + lfs_fi_getlastlength(fs, fip); 699 *offsetp += lfs_btofsb(fs, bytes); 700 701 return error; 702 } 703 704 /* 705 * We have the vnode and its version is correct. 706 * Loop through the blocks and check their currency. 707 */ 708 size = lfs_sb_getbsize(fs); 709 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 710 if (j == lfs_fi_getnblocks(fs, fip) - 1) 711 size = lfs_fi_getlastlength(fs, fip); 712 if (vp != NULL) { 713 lbn = lfs_fi_getblock(fs, fip, j); 714 715 /* Look up current location of this block. */ 716 error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL); 717 if (error) 718 break; 719 720 /* If it is here, the segment is not empty. */ 721 if (LFS_DBTOFSB(fs, daddr) == *offsetp) { 722 error = EEXIST; 723 break; 724 } 725 } 726 *offsetp += lfs_btofsb(fs, size); 727 } 728 729 /* Release vnode */ 730 VOP_UNLOCK(vp); 731 vrele(vp); 732 733 return error; 734 } 735 736 int 737 lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l) 738 { 739 daddr_t offset, endpseg; 740 int error; 741 742 ASSERT_SEGLOCK(fs); 743 744 offset = lfs_sntod(fs, sn); 745 lfs_skip_superblock(fs, &offset); 746 endpseg = lfs_sntod(fs, sn + 1); 747 748 while (offset > 0 && offset < endpseg) { 749 error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 750 ino_func_checkempty, 751 finfo_func_checkempty, 752 CKSEG_NONE, NULL); 753 if (error) 754 return error; 755 } 756 return 0; 757 } 758 759 static long 760 segselect_greedy(struct lfs *fs, int sn, SEGUSE *sup) 761 { 762 return lfs_sb_getssize(fs) - sup->su_nbytes; 763 } 764 765 __inline static long 766 segselect_cb_rosenblum(struct lfs *fs, int sn, SEGUSE *sup, long age) 767 { 768 long benefit, cost; 769 770 benefit = (int64_t)lfs_sb_getssize(fs) - sup->su_nbytes - 771 (sup->su_nsums + 1) * lfs_sb_getfsize(fs); 772 if (sup->su_flags & SEGUSE_SUPERBLOCK) 773 benefit -= LFS_SBPAD; 774 if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs)) /* fragmentation */ 775 benefit -= (lfs_sb_getbsize(fs) / 2); 776 if (benefit <= 0) { 777 return 0; 778 } 779 780 cost = lfs_sb_getssize(fs) + sup->su_nbytes; 781 return (256 * benefit * age) / cost; 782 } 783 784 static long 785 segselect_cb_time(struct lfs *fs, int sn, SEGUSE *sup) 786 { 787 long age; 788 789 age = time_second - sup->su_lastmod; 790 if (age < 0) 791 age = 0; 792 return segselect_cb_rosenblum(fs, sn, sup, age); 793 } 794 795 #if 0 796 /* 797 * Same as the time comparator, but fetch the serial number from the 798 * segment header to compare. 799 * 800 * This is ugly. Whether serial number or wall time is better is a 801 * worthy question, but if we want to use serial number to compute 802 * age, we should record the serial number in su_lastmod instead of 803 * the time. 804 */ 805 static long 806 segselect_cb_serial(struct lfs *fs, int sn, SEGUSE *sup) 807 { 808 struct buf *bp; 809 uint32_t magic; 810 uint64_t age, serial; 811 daddr_t addr; 812 813 addr = lfs_segtod(fs, sn); 814 lfs_skip_superblock(fs, &addr); 815 bread(fs->lfs_devvp, LFS_FSBTODB(fs, addr), 816 lfs_sb_getsumsize(fs), 0, &bp); 817 magic = lfs_ss_getmagic(fs, ((SEGSUM *)bp->b_data)); 818 serial = lfs_ss_getserial(fs, ((SEGSUM *)bp->b_data)); 819 brelse(bp, 0); 820 821 if (magic != SS_MAGIC) 822 return 0; 823 824 age = lfs_sb_getserial(fs) - serial; 825 return segselect_cb_rosenblum(fs, sn, sup, age); 826 } 827 #endif 828 829 void 830 lfs_cleanerd(void *arg) 831 { 832 mount_iterator_t *iter; 833 struct mount *mp; 834 struct lfs *fs; 835 struct vfsops *vfs = NULL; 836 int lfsc; 837 int cleaned_something = 0; 838 839 mutex_enter(&lfs_lock); 840 KASSERTMSG(lfs_cleaner_daemon == NULL, 841 "more than one LFS cleaner daemon"); 842 lfs_cleaner_daemon = curlwp; 843 mutex_exit(&lfs_lock); 844 845 /* Take an extra reference to the LFS vfsops. */ 846 vfs = vfs_getopsbyname(MOUNT_LFS); 847 848 mutex_enter(&lfs_lock); 849 for (;;) { 850 KASSERT(mutex_owned(&lfs_lock)); 851 if (cleaned_something == 0) 852 cv_timedwait(&lfs_allclean_wakeup, &lfs_lock, hz/10 + 1); 853 KASSERT(mutex_owned(&lfs_lock)); 854 cleaned_something = 0; 855 856 KASSERT(mutex_owned(&lfs_lock)); 857 mutex_exit(&lfs_lock); 858 859 /* 860 * Look through the list of LFSs to see if any of them 861 * need cleaning. 862 */ 863 mountlist_iterator_init(&iter); 864 lfsc = 0; 865 while ((mp = mountlist_iterator_next(iter)) != NULL) { 866 KASSERT(!mutex_owned(&lfs_lock)); 867 if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS, 868 sizeof(mp->mnt_stat.f_fstypename)) == 0) { 869 fs = VFSTOULFS(mp)->um_lfs; 870 871 mutex_enter(&lfs_lock); 872 if (fs->lfs_clean_selector != NULL) 873 ++lfsc; 874 mutex_exit(&lfs_lock); 875 cleaned_something += clean(fs); 876 } 877 } 878 if (lfsc == 0) { 879 mutex_enter(&lfs_lock); 880 lfs_cleaner_daemon = NULL; 881 mutex_exit(&lfs_lock); 882 mountlist_iterator_destroy(iter); 883 break; 884 } 885 mountlist_iterator_destroy(iter); 886 887 mutex_enter(&lfs_lock); 888 } 889 KASSERT(!mutex_owned(&lfs_lock)); 890 891 /* Give up our extra reference so the module can be unloaded. */ 892 mutex_enter(&vfs_list_lock); 893 if (vfs != NULL) 894 vfs->vfs_refcount--; 895 mutex_exit(&vfs_list_lock); 896 897 /* Done! */ 898 kthread_exit(0); 899 } 900 901 /* 902 * Look at the file system to see whether it needs cleaning, and if it does, 903 * clean a segment. 904 */ 905 static int 906 clean(struct lfs *fs) 907 { 908 struct buf *bp; 909 SEGUSE *sup; 910 int sn, maxsn, nclean, nready, nempty, nerror, nzero, again, target; 911 long prio, maxprio, maxeprio, thresh; 912 long (*func)(struct lfs *, int, SEGUSE *); 913 uint32_t __debugused segflags = 0; 914 daddr_t oldsn, bfree, avail; 915 int direct, offset; 916 917 func = fs->lfs_clean_selector; 918 if (func == NULL) 919 return 0; 920 921 thresh = fs->lfs_autoclean.thresh; 922 if (fs->lfs_flags & LFS_MUSTCLEAN) 923 thresh = 0; 924 else if (thresh < 0) { 925 /* 926 * Compute a priority threshold based on availability ratio. 927 * XXX These numbers only makes sense for the greedy cleaner. 928 * What is an appropriate threshold for the cost-benefit 929 * cleaner? 930 */ 931 bfree = lfs_sb_getbfree(fs) 932 + lfs_segtod(fs, 1) * lfs_sb_getminfree(fs); 933 avail = lfs_sb_getavail(fs) - fs->lfs_ravail - fs->lfs_favail; 934 if (avail > bfree) 935 return 0; 936 thresh = lfs_sb_getssize(fs) * (bfree - avail) 937 / (lfs_sb_getsize(fs) - avail); 938 if (thresh > lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs)) 939 thresh = lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs); 940 if (thresh > lfs_sb_getssize(fs) - lfs_sb_getbsize(fs)) 941 return 0; 942 } 943 944 target = fs->lfs_autoclean.target; 945 if (target <= 0) { 946 /* Default to half a segment target */ 947 target = lfs_segtod(fs, 1) / 2; 948 } 949 950 oldsn = lfs_dtosn(fs, lfs_sb_getoffset(fs)); 951 952 again = 0; 953 maxprio = maxeprio = -1; 954 nzero = nclean = nready = nempty = nerror = 0; 955 for (sn = 0; sn < lfs_sb_getnseg(fs); sn++) { 956 957 prio = 0; 958 LFS_SEGENTRY(sup, fs, sn, bp); 959 if (sup->su_flags & SEGUSE_ACTIVE) 960 prio = 0; 961 else if (!(sup->su_flags & SEGUSE_DIRTY)) 962 ++nclean; 963 else if (sup->su_flags & SEGUSE_READY) 964 ++nready; 965 else if (sup->su_flags & SEGUSE_EMPTY) 966 ++nempty; 967 else if (sup->su_nbytes == 0) 968 ++nzero; 969 else 970 prio = (*func)(fs, sn, sup); 971 972 if (sup->su_flags & SEGUSE_ERROR) { 973 if (prio > maxeprio) 974 maxeprio = prio; 975 prio = 0; 976 ++nerror; 977 } 978 979 if (prio > maxprio) { 980 maxprio = prio; 981 maxsn = sn; 982 segflags = sup->su_flags; 983 } 984 brelse(bp, 0); 985 } 986 DLOG((DLOG_CLEAN, "%s clean=%d/%d zero=%d empty=%d ready=%d maxsn=%d maxprio=%ld/%ld segflags=0x%lx\n", 987 (maxprio > thresh ? "YES" : "NO "), 988 nclean, (int)lfs_sb_getnseg(fs), nzero, nempty, nready, 989 maxsn, maxprio, (unsigned long)thresh, 990 (unsigned long)segflags)); 991 992 /* 993 * If we are trying to clean the segment we cleaned last, 994 * cleaning did not work. Mark this segment SEGUSE_ERROR 995 * and try again. 996 */ 997 if (maxprio > 0 && fs->lfs_lastcleaned == maxsn) { 998 LFS_SEGENTRY(sup, fs, maxsn, bp); 999 sup->su_flags |= SEGUSE_ERROR; 1000 LFS_WRITESEGENTRY(sup, fs, sn, bp); 1001 return 1; 1002 } 1003 1004 /* 1005 * If there were nothing but error segments, clear error. 1006 * We will wait to try again. 1007 */ 1008 if (maxprio == 0 && maxeprio > 0) { 1009 DLOG((DLOG_CLEAN, "clear error on %d segments, try again\n", 1010 nerror)); 1011 lfs_seguse_clrflag_all(fs, SEGUSE_ERROR); 1012 } 1013 1014 /* Rewrite the highest-priority segment */ 1015 if (maxprio > thresh) { 1016 direct = offset = 0; 1017 (void)lfs_rewrite_segments(fs, &maxsn, 1, 1018 &direct, &offset, curlwp); 1019 DLOG((DLOG_CLEAN, " direct=%d offset=%d\n", direct, offset)); 1020 again += direct; 1021 fs->lfs_clean_accum += offset; 1022 1023 /* Don't clean this again immediately */ 1024 fs->lfs_lastcleaned = maxsn; 1025 } 1026 1027 /* 1028 * If we are in dire straits but we have segments already 1029 * empty, force a double checkpoint to reclaim them. 1030 */ 1031 if (fs->lfs_flags & LFS_MUSTCLEAN) { 1032 if (nready + nempty > 0) { 1033 printf("force checkpoint with nready=%d nempty=%d nzero=%d\n", 1034 nready, nempty, nzero); 1035 lfs_segwrite(fs->lfs_ivnode->v_mount, 1036 SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC); 1037 lfs_segwrite(fs->lfs_ivnode->v_mount, 1038 SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC); 1039 ++again; 1040 } 1041 } else if (fs->lfs_clean_accum > target) { 1042 DLOG((DLOG_CLEAN, "checkpoint to flush\n")); 1043 lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP); 1044 fs->lfs_clean_accum = 0; 1045 } else if (lfs_dtosn(fs, lfs_sb_getoffset(fs)) != oldsn 1046 || nempty + nready > LFS_MAX_ACTIVE) { /* XXX arbitrary */ 1047 DLOG((DLOG_CLEAN, "write to promote empty segments\n")); 1048 lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP); 1049 fs->lfs_clean_accum = 0; 1050 } 1051 1052 return again; 1053 } 1054 1055 /* 1056 * Rewrite a file in its entirety. 1057 * 1058 * Generally this would be done to coalesce a file that is scattered 1059 * around the disk; but if the "scramble" flag is set, instead rewrite 1060 * only the even-numbered blocks, which provides the opposite effect 1061 * for testing purposes. 1062 * 1063 * It is the caller's responsibility to check the bounds of the inode 1064 * numbers. 1065 */ 1066 int 1067 lfs_rewrite_file(struct lfs *fs, ino_t *inoa, int len, bool scramble, 1068 int *directp, int *offsetp) 1069 { 1070 daddr_t hiblk, lbn; 1071 struct vnode *vp; 1072 struct inode *ip; 1073 struct buf *bp; 1074 int i, error, flags; 1075 1076 *directp = 0; 1077 if ((error = lfs_cleanerlock(fs)) != 0) 1078 return error; 1079 flags = SEGM_PROT; 1080 lfs_seglock(fs, flags); 1081 for (i = 0; i < len; ++i) { 1082 error = VFS_VGET(fs->lfs_ivnode->v_mount, inoa[i], LK_EXCLUSIVE, &vp); 1083 if (error) 1084 goto out; 1085 1086 ip = VTOI(vp); 1087 if ((vp->v_uflag & VU_DIROP) || (ip->i_flags & IN_ADIROP)) { 1088 VOP_UNLOCK(vp); 1089 vrele(vp); 1090 error = EAGAIN; 1091 goto out; 1092 } 1093 1094 /* Highest block in this inode */ 1095 hiblk = lfs_lblkno(fs, ip->i_size + lfs_sb_getbsize(fs) - 1) - 1; 1096 1097 for (lbn = 0; lbn <= hiblk; ++lbn) { 1098 if (scramble && (lbn & 0x01)) 1099 continue; 1100 1101 if (lfs_needsflush(fs)) { 1102 lfs_segwrite(fs->lfs_ivnode->v_mount, flags); 1103 } 1104 1105 error = bread(vp, lbn, lfs_blksize(fs, ip, lbn), 0, &bp); 1106 if (error) 1107 break; 1108 1109 /* bp->b_cflags |= BC_INVAL; */ 1110 lfs_bwrite_ext(bp, (flags & SEGM_CLEAN ? BW_CLEAN : 0)); 1111 *directp += lfs_btofsb(fs, bp->b_bcount); 1112 } 1113 1114 /* Done with this vnode */ 1115 VOP_UNLOCK(vp); 1116 vrele(vp); 1117 if (error) 1118 break; 1119 } 1120 out: 1121 lfs_segwrite(fs->lfs_ivnode->v_mount, flags); 1122 *offsetp += lfs_btofsb(fs, fs->lfs_sp->bytes_written); 1123 lfs_segunlock(fs); 1124 lfs_cleanerunlock(fs); 1125 1126 return error; 1127 } 1128 1129 int 1130 lfs_cleanctl(struct lfs *fs, struct lfs_autoclean_params *params) 1131 { 1132 long (*cleanfunc)(struct lfs *, int, SEGUSE *); 1133 1134 fs->lfs_autoclean = *params; 1135 1136 cleanfunc = NULL; 1137 switch (fs->lfs_autoclean.mode) { 1138 case LFS_CLEANMODE_NONE: 1139 cleanfunc = NULL; 1140 break; 1141 1142 case LFS_CLEANMODE_GREEDY: 1143 cleanfunc = segselect_greedy; 1144 break; 1145 1146 case LFS_CLEANMODE_CB: 1147 cleanfunc = segselect_cb_time; 1148 break; 1149 1150 default: 1151 return EINVAL; 1152 } 1153 1154 mutex_enter(&lfs_lock); 1155 if (fs->lfs_clean_selector == NULL && cleanfunc != NULL) 1156 if (++lfs_ncleaners == 1) { 1157 printf("Starting cleaner thread\n"); 1158 if (lfs_cleaner_daemon == NULL && 1159 kthread_create(PRI_BIO, 0, NULL, 1160 lfs_cleanerd, NULL, NULL, 1161 "lfs_cleaner") != 0) 1162 panic("fork lfs_cleaner"); 1163 } 1164 if (fs->lfs_clean_selector != NULL && cleanfunc == NULL) 1165 if (--lfs_ncleaners == 0) { 1166 printf("Stopping cleaner thread\n"); 1167 kthread_join(lfs_cleaner_daemon); 1168 } 1169 fs->lfs_clean_selector = cleanfunc; 1170 mutex_exit(&lfs_lock); 1171 1172 return 0; 1173 } 1174