1 /* $NetBSD: lfs_kclean.c,v 1.4 2026/01/05 05:02:47 perseant Exp $ */ 2 3 /*- 4 * Copyright (c) 2025 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_kclean.c,v 1.4 2026/01/05 05:02:47 perseant Exp $"); 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/namei.h> 38 #include <sys/proc.h> 39 #include <sys/kernel.h> 40 #include <sys/vnode.h> 41 #include <sys/conf.h> 42 #include <sys/kauth.h> 43 #include <sys/buf.h> 44 #include <sys/kthread.h> 45 46 #include <ufs/lfs/ulfs_inode.h> 47 #include <ufs/lfs/ulfsmount.h> 48 #include <ufs/lfs/ulfs_extern.h> 49 50 #include <ufs/lfs/lfs.h> 51 #include <ufs/lfs/lfs_accessors.h> 52 #include <ufs/lfs/lfs_kernel.h> 53 #include <ufs/lfs/lfs_extern.h> 54 55 static int ino_func_setclean(struct lfs_inofuncarg *); 56 static int finfo_func_rewrite(struct lfs_finfofuncarg *); 57 static int finfo_func_setclean(struct lfs_finfofuncarg *); 58 static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t, 59 size_t, int *); 60 61 static int ino_func_rewrite(struct lfs_inofuncarg *); 62 static int ino_func_setclean(struct lfs_inofuncarg *); 63 static int ino_func_checkempty(struct lfs_inofuncarg *); 64 65 static int clean(struct lfs *); 66 static long segselect_cb_rosenblum(struct lfs *, int, SEGUSE *, long); 67 static long segselect_greedy(struct lfs *, int, SEGUSE *); 68 static long segselect_cb_time(struct lfs *, int, SEGUSE *); 69 #if 0 70 static long segselect_cb_serial(struct lfs *, int, SEGUSE *); 71 #endif 72 static int check_clean_list(struct lfs *, ino_t); 73 74 struct lwp * lfs_cleaner_daemon = NULL; 75 extern kcondvar_t lfs_allclean_wakeup; 76 static int lfs_ncleaners = 0; 77 78 static int 79 ino_func_setclean(struct lfs_inofuncarg *lifa) 80 { 81 struct lfs *fs; 82 daddr_t offset; 83 struct vnode *devvp, *vp; 84 union lfs_dinode *dip; 85 struct buf *dbp, *ibp; 86 int error; 87 IFILE *ifp; 88 unsigned i, num; 89 daddr_t true_addr; 90 ino_t ino; 91 92 fs = lifa->fs; 93 offset = lifa->offset; 94 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 95 96 /* Read inode block */ 97 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 98 0, &dbp); 99 if (error) { 100 DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n", 101 error)); 102 return error; 103 } 104 memcpy(lifa->buf, dbp->b_data, dbp->b_bcount); 105 brelse(dbp, BC_AGE); 106 107 /* Check each inode against ifile entry */ 108 num = LFS_INOPB(fs); 109 for (i = num; i-- > 0; ) { 110 dip = DINO_IN_BLOCK(fs, lifa->buf, i); 111 ino = lfs_dino_getinumber(fs, dip); 112 if (ino == LFS_IFILE_INUM) { 113 /* Check address against superblock */ 114 true_addr = lfs_sb_getidaddr(fs); 115 } else { 116 /* Not ifile. Check address against ifile. */ 117 LFS_IENTRY(ifp, fs, ino, ibp); 118 true_addr = lfs_if_getdaddr(fs, ifp); 119 brelse(ibp, 0); 120 } 121 if (offset != true_addr) 122 continue; 123 124 LFS_ASSERT_MAXINO(fs, ino); 125 126 /* XXX We can use fastvget here! */ 127 128 /* 129 * An inode we need to relocate. 130 * Get it if we can. 131 */ 132 if (ino == LFS_IFILE_INUM) 133 vp = fs->lfs_ivnode; 134 else 135 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 136 LK_EXCLUSIVE | LK_NOWAIT, &vp); 137 if (error) 138 continue; 139 140 KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip)); 141 lfs_setclean(fs, vp); 142 if (vp != fs->lfs_ivnode) { 143 VOP_UNLOCK(vp); 144 vrele(vp); 145 } 146 } 147 148 return error; 149 } 150 151 static int 152 ino_func_rewrite(struct lfs_inofuncarg *lifa) 153 { 154 struct lfs *fs; 155 daddr_t offset; 156 struct vnode *devvp, *vp; 157 union lfs_dinode *dip; 158 struct buf *dbp, *ibp; 159 int error; 160 IFILE *ifp; 161 unsigned i, num; 162 daddr_t true_addr; 163 ino_t ino; 164 165 fs = lifa->fs; 166 offset = lifa->offset; 167 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 168 169 /* Read inode block */ 170 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 171 0, &dbp); 172 if (error) { 173 DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n", 174 error)); 175 return error; 176 } 177 memcpy(lifa->buf, dbp->b_data, dbp->b_bcount); 178 brelse(dbp, BC_AGE); 179 180 /* Check each inode against ifile entry */ 181 num = LFS_INOPB(fs); 182 for (i = num; i-- > 0; ) { 183 dip = DINO_IN_BLOCK(fs, lifa->buf, i); 184 ino = lfs_dino_getinumber(fs, dip); 185 if (ino == LFS_IFILE_INUM) { 186 /* Check address against superblock */ 187 true_addr = lfs_sb_getidaddr(fs); 188 } else { 189 /* Not ifile. Check address against ifile. */ 190 LFS_IENTRY(ifp, fs, ino, ibp); 191 true_addr = lfs_if_getdaddr(fs, ifp); 192 brelse(ibp, 0); 193 } 194 if (offset != true_addr) 195 continue; 196 197 if (ino == LFS_IFILE_INUM) 198 continue; 199 200 LFS_ASSERT_MAXINO(fs, ino); 201 202 /* XXX We can use fastvget here! */ 203 204 /* 205 * An inode we need to relocate. 206 * Get it if we can. 207 */ 208 error = check_clean_list(fs, ino); 209 if (error) 210 continue; 211 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 212 LK_EXCLUSIVE | LK_NOWAIT, &vp); 213 if (error) 214 continue; 215 216 KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip)); 217 218 if (!(VTOI(vp)->i_state & IN_CLEANING)) { 219 lfs_setclean(fs, vp); 220 lfs_writeinode(fs, fs->lfs_sp, VTOI(vp)); 221 } 222 223 VOP_UNLOCK(vp); 224 vrele(vp); 225 226 } 227 228 return error; 229 } 230 231 static int 232 rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop) 233 { 234 daddr_t daddr; 235 int error; 236 struct buf *bp; 237 struct inode *ip; 238 239 KASSERT(have_finfop != NULL); 240 241 /* Look up current location of this block. */ 242 error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL); 243 if (error) 244 return error; 245 246 /* Skip any block that is not here. */ 247 if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset) 248 return ESTALE; 249 250 /* 251 * It is (was recently) here. Read the block. 252 */ 253 //size = lfs_blksize(fs, VTOI(vp), lbn); 254 error = bread(vp, lbn, size, 0, &bp); 255 if (error) 256 return error; 257 258 if (vp == fs->lfs_ivnode) { 259 VOP_BWRITE(vp, bp); 260 } else { 261 /* Get ready to write. */ 262 if (!*have_finfop) { 263 ip = VTOI(vp); 264 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); 265 fs->lfs_sp->vp = vp; 266 *have_finfop = 1; 267 } 268 269 KASSERT(bp->b_vp == vp); 270 /* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */ 271 error = lfs_bwrite_ext(bp, BW_CLEAN); 272 if (error) 273 return error; 274 KASSERT(bp->b_vp == vp); 275 mutex_enter(&bufcache_lock); 276 while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) { 277 KASSERT(bp->b_vp != NULL); 278 } 279 mutex_exit(&bufcache_lock); 280 281 KASSERT(bp->b_flags & B_GATHERED); 282 KASSERT(fs->lfs_sp->cbpp[-1] == bp); 283 } 284 return 0; 285 } 286 287 static int 288 check_clean_list(struct lfs *fs, ino_t ino) 289 { 290 struct inode *ip; 291 292 /* 293 * Look for the inode on the clean list. 294 * If it is not there, we can't lock it without risking a deadlock. 295 */ 296 TAILQ_FOREACH(ip, &fs->lfs_cleanhd, i_lfs_clean) { 297 if (ip->i_number == ino) { 298 return 0; 299 } 300 } 301 return EWOULDBLOCK; 302 } 303 304 static int 305 finfo_func_rewrite(struct lfs_finfofuncarg *lffa) 306 { 307 struct lfs *fs; 308 FINFO *fip; 309 daddr_t *offsetp; 310 int j, have_finfo, error; 311 size_t size, bytes; 312 ino_t ino; 313 uint32_t gen; 314 struct vnode *vp; 315 daddr_t lbn; 316 int *fragsp; 317 318 fs = lffa->fs; 319 fip = lffa->finfop; 320 offsetp = lffa->offsetp; 321 fragsp = (int *)lffa->arg; 322 323 /* Get the inode and check its version. */ 324 ino = lfs_fi_getino(fs, fip); 325 gen = lfs_fi_getversion(fs, fip); 326 error = 0; 327 if (ino == LFS_IFILE_INUM) 328 vp = fs->lfs_ivnode; 329 else { 330 LFS_ASSERT_MAXINO(fs, ino); 331 error = check_clean_list(fs, ino); 332 if (error) 333 vp = NULL; 334 else 335 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 336 LK_EXCLUSIVE|LK_NOWAIT, &vp); 337 } 338 339 /* 340 * If we can't, or if version is wrong, or it has dirop blocks on it, 341 * we can't relocate its blocks; but we still have to count 342 * blocks through the partial segment to return the right offset. 343 * XXX actually we can move DIROP vnodes' *old* data, as long 344 * XXX as we are sure that we are moving *only* the old data---? 345 */ 346 if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) { 347 if (error == 0) 348 error = ESTALE; 349 350 if (vp != NULL && vp != fs->lfs_ivnode) { 351 VOP_UNLOCK(vp); 352 vrele(vp); 353 } 354 vp = NULL; 355 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs)) 356 + lfs_fi_getlastlength(fs, fip); 357 *offsetp += lfs_btofsb(fs, bytes); 358 359 return error; 360 } 361 362 /* 363 * We have the vnode and its version is correct. 364 * Take a cleaning reference; and loop through the blocks 365 * and rewrite them. 366 */ 367 lfs_setclean(fs, vp); 368 size = lfs_sb_getbsize(fs); 369 have_finfo = 0; 370 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 371 if (j == lfs_fi_getnblocks(fs, fip) - 1) 372 size = lfs_fi_getlastlength(fs, fip); 373 /* 374 * An error of ESTALE indicates that there was nothing 375 * to rewrite; this is not a problem. Any other error 376 * causes us to skip the rest of this FINFO. 377 */ 378 if (vp != NULL && error == 0) { 379 lbn = lfs_fi_getblock(fs, fip, j); 380 error = rewrite_block(fs, vp, lbn, *offsetp, 381 size, &have_finfo); 382 if (error == ESTALE) 383 error = 0; 384 if (fragsp != NULL && error == 0) 385 *fragsp += lfs_btofsb(fs, size); 386 } 387 *offsetp += lfs_btofsb(fs, size); 388 } 389 390 /* 391 * If we acquired finfo, release it and write the blocks. 392 */ 393 if (have_finfo) { 394 lfs_updatemeta(fs->lfs_sp); 395 fs->lfs_sp->vp = NULL; 396 lfs_release_finfo(fs); 397 lfs_writeinode(fs, fs->lfs_sp, VTOI(vp)); 398 } 399 400 /* Release vnode */ 401 if (vp != fs->lfs_ivnode) { 402 VOP_UNLOCK(vp); 403 vrele(vp); 404 } 405 406 return error; 407 } 408 409 static int 410 finfo_func_setclean(struct lfs_finfofuncarg *lffa) 411 { 412 struct lfs *fs; 413 FINFO *fip; 414 daddr_t *offsetp; 415 int error; 416 size_t bytes; 417 ino_t ino; 418 uint32_t gen; 419 struct vnode *vp; 420 421 fs = lffa->fs; 422 fip = lffa->finfop; 423 offsetp = lffa->offsetp; 424 425 /* Get the inode and check its version. */ 426 ino = lfs_fi_getino(fs, fip); 427 gen = lfs_fi_getversion(fs, fip); 428 error = 0; 429 if (ino == LFS_IFILE_INUM) 430 vp = fs->lfs_ivnode; 431 else { 432 LFS_ASSERT_MAXINO(fs, ino); 433 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, 434 LK_EXCLUSIVE|LK_NOWAIT, &vp); 435 } 436 437 /* If we have it and its version is right, take a cleaning reference */ 438 if (error == 0 && VTOI(vp)->i_gen == gen) 439 lfs_setclean(fs, vp); 440 441 if (vp == fs->lfs_ivnode) 442 vp = NULL; 443 else if (vp != NULL) { 444 VOP_UNLOCK(vp); 445 vrele(vp); 446 vp = NULL; 447 } 448 449 /* Skip to the next block */ 450 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs)) 451 + lfs_fi_getlastlength(fs, fip); 452 *offsetp += lfs_btofsb(fs, bytes); 453 454 return error; 455 } 456 457 /* 458 * Use the partial-segment parser to rewrite (clean) a segment. 459 */ 460 int 461 lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l) 462 { 463 daddr_t ooffset, offset, endpseg; 464 465 ASSERT_SEGLOCK(fs); 466 467 offset = lfs_sntod(fs, sn); 468 lfs_skip_superblock(fs, &offset); 469 endpseg = lfs_sntod(fs, sn + 1); 470 471 while (offset > 0 && offset != endpseg) { 472 /* First check summary validity (XXX unnecessary?) */ 473 ooffset = offset; 474 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 475 NULL, NULL, CKSEG_CKSUM, NULL); 476 if (offset == ooffset) 477 break; 478 479 /* 480 * Valid, proceed. 481 * 482 * First write the file blocks, marking their 483 * inodes IN_CLEANING. 484 */ 485 offset = ooffset; 486 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 487 NULL, finfo_func_rewrite, 488 CKSEG_NONE, fragsp); 489 490 /* 491 * Now go back and pick up any inodes that 492 * were not already marked IN_CLEANING, and 493 * write them as well. 494 */ 495 offset = ooffset; 496 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 497 ino_func_rewrite, NULL, 498 CKSEG_NONE, fragsp); 499 } 500 return 0; 501 } 502 503 /* 504 * Rewrite the contents of one or more segments, in preparation for 505 * marking them clean. 506 */ 507 int 508 lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l) 509 { 510 kauth_cred_t cred; 511 int i, error; 512 struct buf *bp; 513 SEGUSE *sup; 514 daddr_t offset, endpseg; 515 516 ASSERT_NO_SEGLOCK(fs); 517 518 cred = l ? l->l_cred : NOCRED; 519 520 /* Prevent new dirops and acquire the cleaner lock. */ 521 lfs_writer_enter(fs, "rewritesegs"); 522 if ((error = lfs_cleanerlock(fs)) != 0) { 523 lfs_writer_leave(fs); 524 return error; 525 } 526 527 /* 528 * Pre-reference vnodes now that we have cleaner lock 529 * but before we take the segment lock. We don't want to 530 * mix cleaning blocks with flushed vnodes. 531 */ 532 for (i = 0; i < len; i++) { 533 error = 0; 534 /* Refuse to clean segments that are ACTIVE */ 535 LFS_SEGENTRY(sup, fs, snn[i], bp); 536 if (sup->su_flags & SEGUSE_ACTIVE 537 || !(sup->su_flags & SEGUSE_DIRTY)) 538 error = EINVAL; 539 540 brelse(bp, 0); 541 if (error) 542 break; 543 544 offset = lfs_sntod(fs, snn[i]); 545 lfs_skip_superblock(fs, &offset); 546 endpseg = lfs_sntod(fs, snn[i] + 1); 547 548 while (offset > 0 && offset != endpseg) { 549 lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 550 ino_func_setclean, finfo_func_setclean, 551 CKSEG_NONE, NULL); 552 } 553 } 554 555 /* 556 * Actually rewrite the contents of the segment. 557 */ 558 lfs_seglock(fs, SEGM_CLEAN); 559 560 for (i = 0; i < len; i++) { 561 error = 0; 562 /* Refuse to clean segments that are ACTIVE */ 563 LFS_SEGENTRY(sup, fs, snn[i], bp); 564 if (sup->su_flags & SEGUSE_ACTIVE 565 || !(sup->su_flags & SEGUSE_DIRTY)) 566 error = EINVAL; 567 568 brelse(bp, 0); 569 if (error) 570 break; 571 572 error = lfs_rewrite_segment(fs, snn[i], directp, cred, l); 573 if (error) 574 break; 575 } 576 while (lfs_writeseg(fs, fs->lfs_sp)) 577 ; 578 579 *offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written); 580 lfs_segunlock(fs); 581 lfs_cleanerunlock(fs); 582 lfs_writer_leave(fs); 583 584 return error; 585 } 586 587 static int 588 ino_func_checkempty(struct lfs_inofuncarg *lifa) 589 { 590 struct lfs *fs; 591 daddr_t offset; 592 struct vnode *devvp; 593 union lfs_dinode *dip; 594 struct buf *dbp, *ibp; 595 int error; 596 IFILE *ifp; 597 unsigned i, num; 598 daddr_t true_addr; 599 ino_t ino; 600 601 fs = lifa->fs; 602 offset = lifa->offset; 603 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 604 605 /* Read inode block */ 606 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 607 0, &dbp); 608 if (error) { 609 DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n", 610 error)); 611 return error; 612 } 613 614 /* Check each inode against ifile entry */ 615 num = LFS_INOPB(fs); 616 for (i = num; i-- > 0; ) { 617 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 618 ino = lfs_dino_getinumber(fs, dip); 619 if (ino == LFS_IFILE_INUM) { 620 /* Check address against superblock */ 621 true_addr = lfs_sb_getidaddr(fs); 622 } else { 623 /* Not ifile. Check address against ifile. */ 624 LFS_IENTRY(ifp, fs, ino, ibp); 625 true_addr = lfs_if_getdaddr(fs, ifp); 626 brelse(ibp, 0); 627 } 628 if (offset == true_addr) { 629 error = EEXIST; 630 break; 631 } 632 } 633 brelse(dbp, BC_AGE); 634 635 return error; 636 } 637 638 static int 639 finfo_func_checkempty(struct lfs_finfofuncarg *lffa) 640 { 641 struct lfs *fs; 642 FINFO *fip; 643 daddr_t *offsetp; 644 int j, error; 645 size_t size, bytes; 646 ino_t ino; 647 uint32_t gen; 648 struct vnode *vp; 649 daddr_t lbn, daddr; 650 651 fs = lffa->fs; 652 fip = lffa->finfop; 653 offsetp = lffa->offsetp; 654 655 /* Get the inode and check its version. */ 656 ino = lfs_fi_getino(fs, fip); 657 gen = lfs_fi_getversion(fs, fip); 658 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE|LK_NOWAIT, &vp); 659 660 /* 661 * If we can't, or if version is wrong, this FINFO does not refer 662 * to a live file. Skip over it and continue. 663 */ 664 if (error || VTOI(vp)->i_gen != gen) { 665 if (error == 0) 666 error = ESTALE; 667 668 if (vp != NULL) { 669 VOP_UNLOCK(vp); 670 vrele(vp); 671 vp = NULL; 672 } 673 bytes = ((lfs_fi_getnblocks(fs, fip) - 1) 674 << lfs_sb_getbshift(fs)) 675 + lfs_fi_getlastlength(fs, fip); 676 *offsetp += lfs_btofsb(fs, bytes); 677 678 return error; 679 } 680 681 /* 682 * We have the vnode and its version is correct. 683 * Loop through the blocks and check their currency. 684 */ 685 size = lfs_sb_getbsize(fs); 686 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 687 if (j == lfs_fi_getnblocks(fs, fip) - 1) 688 size = lfs_fi_getlastlength(fs, fip); 689 if (vp != NULL) { 690 lbn = lfs_fi_getblock(fs, fip, j); 691 692 /* Look up current location of this block. */ 693 error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL); 694 if (error) 695 break; 696 697 /* If it is here, the segment is not empty. */ 698 if (LFS_DBTOFSB(fs, daddr) == *offsetp) { 699 error = EEXIST; 700 break; 701 } 702 } 703 *offsetp += lfs_btofsb(fs, size); 704 } 705 706 /* Release vnode */ 707 VOP_UNLOCK(vp); 708 vrele(vp); 709 710 return error; 711 } 712 713 int 714 lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l) 715 { 716 daddr_t offset, endpseg; 717 int error; 718 719 ASSERT_SEGLOCK(fs); 720 721 offset = lfs_sntod(fs, sn); 722 lfs_skip_superblock(fs, &offset); 723 endpseg = lfs_sntod(fs, sn + 1); 724 725 while (offset > 0 && offset < endpseg) { 726 error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l, 727 ino_func_checkempty, 728 finfo_func_checkempty, 729 CKSEG_NONE, NULL); 730 if (error) 731 return error; 732 } 733 return 0; 734 } 735 736 static long 737 segselect_greedy(struct lfs *fs, int sn, SEGUSE *sup) 738 { 739 return lfs_sb_getssize(fs) - sup->su_nbytes; 740 } 741 742 __inline static long 743 segselect_cb_rosenblum(struct lfs *fs, int sn, SEGUSE *sup, long age) 744 { 745 long benefit, cost; 746 747 benefit = (int64_t)lfs_sb_getssize(fs) - sup->su_nbytes - 748 (sup->su_nsums + 1) * lfs_sb_getfsize(fs); 749 if (sup->su_flags & SEGUSE_SUPERBLOCK) 750 benefit -= LFS_SBPAD; 751 if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs)) /* fragmentation */ 752 benefit -= (lfs_sb_getbsize(fs) / 2); 753 if (benefit <= 0) { 754 return 0; 755 } 756 757 cost = lfs_sb_getssize(fs) + sup->su_nbytes; 758 return (256 * benefit * age) / cost; 759 } 760 761 static long 762 segselect_cb_time(struct lfs *fs, int sn, SEGUSE *sup) 763 { 764 long age; 765 766 age = time_second - sup->su_lastmod; 767 if (age < 0) 768 age = 0; 769 return segselect_cb_rosenblum(fs, sn, sup, age); 770 } 771 772 #if 0 773 /* 774 * Same as the time comparator, but fetch the serial number from the 775 * segment header to compare. 776 * 777 * This is ugly. Whether serial number or wall time is better is a 778 * worthy question, but if we want to use serial number to compute 779 * age, we should record the serial number in su_lastmod instead of 780 * the time. 781 */ 782 static long 783 segselect_cb_serial(struct lfs *fs, int sn, SEGUSE *sup) 784 { 785 struct buf *bp; 786 uint32_t magic; 787 uint64_t age, serial; 788 daddr_t addr; 789 790 addr = lfs_segtod(fs, sn); 791 lfs_skip_superblock(fs, &addr); 792 bread(fs->lfs_devvp, LFS_FSBTODB(fs, addr), 793 lfs_sb_getsumsize(fs), 0, &bp); 794 magic = lfs_ss_getmagic(fs, ((SEGSUM *)bp->b_data)); 795 serial = lfs_ss_getserial(fs, ((SEGSUM *)bp->b_data)); 796 brelse(bp, 0); 797 798 if (magic != SS_MAGIC) 799 return 0; 800 801 age = lfs_sb_getserial(fs) - serial; 802 return segselect_cb_rosenblum(fs, sn, sup, age); 803 } 804 #endif 805 806 void 807 lfs_cleanerd(void *arg) 808 { 809 mount_iterator_t *iter; 810 struct mount *mp; 811 struct lfs *fs; 812 struct vfsops *vfs = NULL; 813 int lfsc; 814 int cleaned_something = 0; 815 816 /* Take an extra reference to the LFS vfsops. */ 817 vfs = vfs_getopsbyname(MOUNT_LFS); 818 819 mutex_enter(&lfs_lock); 820 for (;;) { 821 KASSERT(mutex_owned(&lfs_lock)); 822 if (cleaned_something == 0) 823 cv_timedwait(&lfs_allclean_wakeup, &lfs_lock, hz/10 + 1); 824 KASSERT(mutex_owned(&lfs_lock)); 825 cleaned_something = 0; 826 827 KASSERT(mutex_owned(&lfs_lock)); 828 mutex_exit(&lfs_lock); 829 830 /* 831 * Look through the list of LFSs to see if any of them 832 * need cleaning. 833 */ 834 mountlist_iterator_init(&iter); 835 lfsc = 0; 836 while ((mp = mountlist_iterator_next(iter)) != NULL) { 837 KASSERT(!mutex_owned(&lfs_lock)); 838 if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS, 839 sizeof(mp->mnt_stat.f_fstypename)) == 0) { 840 fs = VFSTOULFS(mp)->um_lfs; 841 842 mutex_enter(&lfs_lock); 843 if (fs->lfs_clean_selector == NULL) { 844 /* Notify cleanctl */ 845 if (fs->lfs_autoclean_status) { 846 fs->lfs_autoclean_status = 847 LFS_AUTOCLEAN_STATUS_OFF; 848 cv_broadcast(&fs->lfs_cleanquitcv); 849 } 850 } else 851 ++lfsc; 852 mutex_exit(&lfs_lock); 853 cleaned_something += clean(fs); 854 } 855 } 856 if (lfsc == 0) { 857 mutex_enter(&lfs_lock); 858 lfs_cleaner_daemon = NULL; 859 mutex_exit(&lfs_lock); 860 mountlist_iterator_destroy(iter); 861 break; 862 } 863 mountlist_iterator_destroy(iter); 864 865 mutex_enter(&lfs_lock); 866 } 867 KASSERT(!mutex_owned(&lfs_lock)); 868 869 /* Give up our extra reference so the module can be unloaded. */ 870 mutex_enter(&vfs_list_lock); 871 if (vfs != NULL) 872 vfs->vfs_refcount--; 873 mutex_exit(&vfs_list_lock); 874 875 /* Done! */ 876 kthread_exit(0); 877 } 878 879 /* 880 * Look at the file system to see whether it needs cleaning, and if it does, 881 * clean a segment. 882 */ 883 static int 884 clean(struct lfs *fs) 885 { 886 struct buf *bp; 887 SEGUSE *sup; 888 int sn, maxsn, nclean, nready, nempty, nerror, nzero, again, target; 889 long prio, maxprio, maxeprio, thresh; 890 long (*func)(struct lfs *, int, SEGUSE *); 891 uint32_t __debugused segflags = 0; 892 daddr_t oldsn, bfree, avail; 893 int direct, offset; 894 895 mutex_enter(&lfs_lock); 896 func = fs->lfs_clean_selector; 897 mutex_exit(&lfs_lock); 898 if (func == NULL) 899 return 1; /* Run again so we get cleaned up immediately */ 900 901 thresh = fs->lfs_autoclean.thresh; 902 if (fs->lfs_flags & LFS_MUSTCLEAN) 903 thresh = 0; 904 else if (thresh < 0) { 905 /* 906 * Compute a priority threshold based on availability ratio. 907 * XXX These numbers only makes sense for the greedy cleaner. 908 * What is an appropriate threshold for the cost-benefit 909 * cleaner? 910 */ 911 bfree = lfs_sb_getbfree(fs) 912 + lfs_segtod(fs, 1) * lfs_sb_getminfree(fs); 913 avail = lfs_sb_getavail(fs) - fs->lfs_ravail - fs->lfs_favail; 914 if (avail > bfree) 915 return 0; 916 thresh = lfs_sb_getssize(fs) * (bfree - avail) 917 / (lfs_sb_getsize(fs) - avail); 918 if (thresh > lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs)) 919 thresh = lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs); 920 if (thresh > lfs_sb_getssize(fs) - lfs_sb_getbsize(fs)) 921 return 0; 922 } 923 924 target = fs->lfs_autoclean.target; 925 if (target <= 0) { 926 /* Default to half a segment target */ 927 target = lfs_segtod(fs, 1) / 2; 928 } 929 930 oldsn = lfs_dtosn(fs, lfs_sb_getoffset(fs)); 931 932 again = 0; 933 maxprio = maxeprio = -1; 934 nzero = nclean = nready = nempty = nerror = 0; 935 for (sn = 0; sn < lfs_sb_getnseg(fs); sn++) { 936 937 prio = 0; 938 LFS_SEGENTRY(sup, fs, sn, bp); 939 if (sup->su_flags & SEGUSE_ACTIVE) 940 prio = 0; 941 else if (!(sup->su_flags & SEGUSE_DIRTY)) 942 ++nclean; 943 else if (sup->su_flags & SEGUSE_READY) 944 ++nready; 945 else if (sup->su_flags & SEGUSE_EMPTY) 946 ++nempty; 947 else if (sup->su_nbytes == 0) 948 ++nzero; 949 else 950 prio = (*func)(fs, sn, sup); 951 952 if (sup->su_flags & SEGUSE_ERROR) { 953 if (prio > maxeprio) 954 maxeprio = prio; 955 prio = 0; 956 ++nerror; 957 } 958 959 if (prio > maxprio) { 960 maxprio = prio; 961 maxsn = sn; 962 segflags = sup->su_flags; 963 } 964 brelse(bp, 0); 965 } 966 DLOG((DLOG_CLEAN, "%s clean=%d/%d zero=%d empty=%d ready=%d maxsn=%d maxprio=%ld/%ld segflags=0x%lx\n", 967 (maxprio > thresh ? "YES" : "NO "), 968 nclean, (int)lfs_sb_getnseg(fs), nzero, nempty, nready, 969 maxsn, maxprio, (unsigned long)thresh, 970 (unsigned long)segflags)); 971 972 /* 973 * If we are trying to clean the segment we cleaned last, 974 * cleaning did not work. Mark this segment SEGUSE_ERROR 975 * and try again. 976 */ 977 if (maxprio > 0 && fs->lfs_lastcleaned == maxsn) { 978 LFS_SEGENTRY(sup, fs, maxsn, bp); 979 sup->su_flags |= SEGUSE_ERROR; 980 LFS_WRITESEGENTRY(sup, fs, sn, bp); 981 return 1; 982 } 983 984 /* 985 * If there were nothing but error segments, clear error. 986 * We will wait to try again. 987 */ 988 if (maxprio == 0 && maxeprio > 0) { 989 DLOG((DLOG_CLEAN, "clear error on %d segments, try again\n", 990 nerror)); 991 lfs_seguse_clrflag_all(fs, SEGUSE_ERROR); 992 } 993 994 /* Rewrite the highest-priority segment */ 995 if (maxprio > thresh) { 996 direct = offset = 0; 997 (void)lfs_rewrite_segments(fs, &maxsn, 1, 998 &direct, &offset, curlwp); 999 DLOG((DLOG_CLEAN, " direct=%d offset=%d\n", direct, offset)); 1000 again += direct; 1001 fs->lfs_clean_accum += offset; 1002 1003 /* Don't clean this again immediately */ 1004 fs->lfs_lastcleaned = maxsn; 1005 } 1006 1007 /* 1008 * If we are in dire straits but we have segments already 1009 * empty, force a double checkpoint to reclaim them. 1010 */ 1011 if (fs->lfs_flags & LFS_MUSTCLEAN) { 1012 if (nready + nempty > 0) { 1013 DLOG((DLOG_CLEAN, "force checkpoint with nready=%d nempty=%d nzero=%d\n", 1014 nready, nempty, nzero)); 1015 lfs_segwrite(fs->lfs_ivnode->v_mount, 1016 SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC); 1017 lfs_segwrite(fs->lfs_ivnode->v_mount, 1018 SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC); 1019 ++again; 1020 } 1021 } else if (fs->lfs_clean_accum > target) { 1022 DLOG((DLOG_CLEAN, "checkpoint to flush\n")); 1023 lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP); 1024 fs->lfs_clean_accum = 0; 1025 } else if (lfs_dtosn(fs, lfs_sb_getoffset(fs)) != oldsn 1026 || nempty + nready > LFS_MAX_ACTIVE) { /* XXX arbitrary */ 1027 DLOG((DLOG_CLEAN, "write to promote empty segments\n")); 1028 lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP); 1029 fs->lfs_clean_accum = 0; 1030 } 1031 1032 return again; 1033 } 1034 1035 /* 1036 * Rewrite a file in its entirety. 1037 * 1038 * Generally this would be done to coalesce a file that is scattered 1039 * around the disk; but if the "scramble" flag is set, instead rewrite 1040 * only the even-numbered blocks, which provides the opposite effect 1041 * for testing purposes. 1042 * 1043 * It is the caller's responsibility to check the bounds of the inode 1044 * numbers. 1045 */ 1046 int 1047 lfs_rewrite_file(struct lfs *fs, ino_t *inoa, int len, bool scramble, 1048 int *directp, int *offsetp) 1049 { 1050 daddr_t hiblk, lbn; 1051 struct vnode *vp; 1052 struct inode *ip; 1053 struct buf *bp; 1054 int i, error; 1055 1056 KASSERT(directp != NULL); 1057 KASSERT(offsetp != NULL); 1058 1059 *directp = 0; 1060 if ((error = lfs_cleanerlock(fs)) != 0) 1061 return error; 1062 lfs_seglock(fs, 0); 1063 for (i = 0; i < len; ++i) { 1064 error = VFS_VGET(fs->lfs_ivnode->v_mount, inoa[i], 1065 LK_EXCLUSIVE | LK_NOWAIT, &vp); 1066 if (error) 1067 goto out; 1068 1069 ip = VTOI(vp); 1070 if ((vp->v_uflag & VU_DIROP) || (ip->i_flags & IN_ADIROP)) { 1071 VOP_UNLOCK(vp); 1072 vrele(vp); 1073 error = EAGAIN; 1074 goto out; 1075 } 1076 1077 /* Highest block in this inode */ 1078 hiblk = lfs_lblkno(fs, ip->i_size + lfs_sb_getbsize(fs) - 1) - 1; 1079 1080 for (lbn = 0; lbn <= hiblk; ++lbn) { 1081 if (scramble && (lbn & 0x01)) 1082 continue; 1083 1084 if (lfs_needsflush(fs)) { 1085 lfs_segwrite(fs->lfs_ivnode->v_mount, 0); 1086 } 1087 1088 error = bread(vp, lbn, lfs_blksize(fs, ip, lbn), 0, &bp); 1089 if (error) 1090 break; 1091 1092 /* bp->b_cflags |= BC_INVAL; */ 1093 lfs_bwrite_ext(bp, 0); 1094 *directp += lfs_btofsb(fs, bp->b_bcount); 1095 } 1096 1097 /* Done with this vnode */ 1098 VOP_UNLOCK(vp); 1099 vrele(vp); 1100 if (error) 1101 break; 1102 } 1103 out: 1104 lfs_segwrite(fs->lfs_ivnode->v_mount, 0); 1105 *offsetp += lfs_btofsb(fs, fs->lfs_sp->bytes_written); 1106 lfs_segunlock(fs); 1107 lfs_cleanerunlock(fs); 1108 1109 return error; 1110 } 1111 1112 int 1113 lfs_cleanctl(struct lfs *fs, struct lfs_autoclean_params *params) 1114 { 1115 long (*cleanfunc)(struct lfs *, int, SEGUSE *); 1116 1117 fs->lfs_autoclean = *params; 1118 1119 cleanfunc = NULL; 1120 switch (fs->lfs_autoclean.mode) { 1121 case LFS_CLEANMODE_NONE: 1122 cleanfunc = NULL; 1123 break; 1124 1125 case LFS_CLEANMODE_GREEDY: 1126 cleanfunc = segselect_greedy; 1127 break; 1128 1129 case LFS_CLEANMODE_CB: 1130 cleanfunc = segselect_cb_time; 1131 break; 1132 1133 default: 1134 return EINVAL; 1135 } 1136 1137 mutex_enter(&lfs_lock); 1138 while (cleanfunc == NULL && 1139 fs->lfs_autoclean_status != LFS_AUTOCLEAN_STATUS_OFF) { 1140 cv_wait(&fs->lfs_cleanquitcv, &lfs_lock); 1141 } 1142 if (fs->lfs_clean_selector == NULL && cleanfunc != NULL) 1143 if (++lfs_ncleaners == 1) { 1144 if (lfs_cleaner_daemon == NULL && 1145 kthread_create(PRI_BIO, 0, NULL, 1146 lfs_cleanerd, NULL, 1147 &lfs_cleaner_daemon, 1148 "lfs_cleaner") != 0) 1149 panic("fork lfs_cleaner"); 1150 } 1151 if (fs->lfs_clean_selector != NULL && cleanfunc == NULL) { 1152 if (--lfs_ncleaners == 0) { 1153 #if 0 1154 kthread_join(lfs_cleaner_daemon); 1155 lfs_cleaner_daemon = NULL; 1156 #endif /* 0 */ 1157 } 1158 } 1159 fs->lfs_clean_selector = cleanfunc; 1160 mutex_exit(&lfs_lock); 1161 1162 return 0; 1163 } 1164