/*	$NetBSD: lfs_bio.c,v 1.151 2025/10/20 04:20:37 perseant Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_bio.c	8.10 (Berkeley) 6/10/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.151 2025/10/20 04:20:37 perseant Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_extern.h>
#include <ufs/lfs/lfs_kernel.h>

#include <uvm/uvm_extern.h>

/*
 * LFS block write function.
 *
 * XXX
 * No write cost accounting is done.
 * This is almost certainly wrong for synchronous operations and NFS.
 *
 * protected by lfs_lock.
 */
int	locked_queue_count = 0;		/* Count of locked-down buffers. */
long	locked_queue_bytes = 0L;	/* Total size of locked buffers. */
int	lfs_subsys_pages = 0L;		/* Total number of LFS-written pages */
int	lfs_fs_pagetrip = 0;		/* # of pages to trip per-fs write */
int	lfs_writing = 0;		/* Set if already kicked off a writer
					   because of buffer space */
int	locked_queue_waiters = 0;	/* Number of processes waiting on lq */

/* Lock and condition variables for above. */
kcondvar_t	locked_queue_cv;
kcondvar_t	lfs_writing_cv;
kmutex_t	lfs_lock;
extern kcondvar_t lfs_writerd_cv;

extern int lfs_dostats;

/*
 * reserved number/bytes of locked buffers
 */
int locked_queue_rcount = 0;
long locked_queue_rbytes = 0L;

static int lfs_fits_buf(struct lfs *, int, int);
static int lfs_reservebuf(struct lfs *, struct vnode *vp, struct vnode *vp2,
    int, int);
static int lfs_reserveavail(struct lfs *, struct vnode *vp, struct vnode *vp2,
    int);

static int
lfs_fits_buf(struct lfs *fs, int n, int bytes)
{
	int count_fit, bytes_fit;

	ASSERT_NO_SEGLOCK(fs);
	KASSERT(mutex_owned(&lfs_lock));

	count_fit =
	    (locked_queue_count + locked_queue_rcount + n <= LFS_WAIT_BUFS);
	bytes_fit =
	    (locked_queue_bytes + locked_queue_rbytes + bytes <= LFS_WAIT_BYTES);

#ifdef DEBUG
	if (!count_fit) {
		DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit count: %d + %d + %d >= %d\n",
		      locked_queue_count, locked_queue_rcount,
		      n, LFS_WAIT_BUFS));
	}
	if (!bytes_fit) {
		DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit bytes: %ld + %ld + %d >= %ld\n",
		      locked_queue_bytes, locked_queue_rbytes,
		      bytes, LFS_WAIT_BYTES));
	}
#endif /* DEBUG */

	return (count_fit && bytes_fit);
}

/* ARGSUSED */
static int
lfs_reservebuf(struct lfs *fs, struct vnode *vp,
    struct vnode *vp2, int n, int bytes)
{
	int cantwait;

	ASSERT_MAYBE_SEGLOCK(fs);
	KASSERT(locked_queue_rcount >= 0);
	KASSERT(locked_queue_rbytes >= 0);

	cantwait = (VTOI(vp)->i_state & IN_ADIROP) || fs->lfs_unlockvp == vp;
	mutex_enter(&lfs_lock);
	while (!cantwait && n > 0 && !lfs_fits_buf(fs, n, bytes)) {
		int error;

		DLOG((DLOG_FLUSH, "lfs_reservebuf: flush filesystem %p with checkpoint\n", fs));
		lfs_flush(fs, SEGM_CKP, 0);

		DLOG((DLOG_AVAIL, "lfs_reservebuf: waiting: count=%d, bytes=%ld\n",
		      locked_queue_count, locked_queue_bytes));
		++locked_queue_waiters;
		cv_broadcast(&lfs_writerd_cv);
		error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
		    hz * LFS_BUFWAIT);
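		/*
		 * A timeout (EWOULDBLOCK) from the wait above is not an
		 * error here; we simply loop and re-test lfs_fits_buf().
		 * Only a signal or other real error aborts the reservation.
		 */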
		--locked_queue_waiters;
		if (error && error != EWOULDBLOCK) {
			mutex_exit(&lfs_lock);
			return error;
		}
	}

	locked_queue_rcount += n;
	locked_queue_rbytes += bytes;

	if (n < 0 && locked_queue_waiters > 0) {
		DLOG((DLOG_AVAIL, "lfs_reservebuf: broadcast: count=%d, bytes=%ld\n",
		      locked_queue_count, locked_queue_bytes));
		cv_broadcast(&locked_queue_cv);
	}

	mutex_exit(&lfs_lock);

	KASSERT(locked_queue_rcount >= 0);
	KASSERT(locked_queue_rbytes >= 0);

	return 0;
}

/*
 * Try to reserve some blocks, prior to performing a sensitive operation that
 * requires the vnode lock to be honored.  If there is not enough space, wait
 * for the space to become available.
 *
 * Called with vp locked.  (Note however that if fsb < 0, vp is ignored.)
 */
static int
lfs_reserveavail(struct lfs *fs, struct vnode *vp,
    struct vnode *vp2, int fsb)
{
	CLEANERINFO *cip;
	struct buf *bp;
	int error, slept;
	int cantwait;

	ASSERT_MAYBE_SEGLOCK(fs);
	slept = 0;
	mutex_enter(&lfs_lock);
	cantwait = (VTOI(vp)->i_state & IN_ADIROP) || fs->lfs_unlockvp == vp;
	while (!cantwait && fsb > 0 &&
	    !lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) {
		mutex_exit(&lfs_lock);

		if (!slept) {
			DLOG((DLOG_AVAIL, "lfs_reserve: waiting for %ld (bfree = %jd,"
			      " est_bfree = %jd)\n",
			      fsb + fs->lfs_ravail + fs->lfs_favail,
			      (intmax_t)lfs_sb_getbfree(fs),
			      (intmax_t)LFS_EST_BFREE(fs)));
		}
		++slept;

		/* Wake up the cleaner */
		LFS_CLEANERINFO(cip, fs, bp);
		LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
		lfs_wakeup_cleaner(fs);

		mutex_enter(&lfs_lock);
		/* Cleaner might have run while we were reading, check again */
		if (lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail))
			break;

		error = mtsleep(&fs->lfs_availsleep, PCATCH | PUSER,
		    "lfs_reserve", 0, &lfs_lock);
		if (error) {
			mutex_exit(&lfs_lock);
			return error;
		}
	}
#ifdef DEBUG
	if (slept) {
		DLOG((DLOG_AVAIL, "lfs_reserve: woke up\n"));
	}
#endif
	fs->lfs_ravail += fsb;
	mutex_exit(&lfs_lock);

	return 0;
}

#ifdef DIAGNOSTIC
int lfs_rescount;
int lfs_rescountdirop;
#endif

int
lfs_reserve(struct lfs *fs, struct vnode *vp, struct vnode *vp2, int fsb)
{
	int error;

	ASSERT_MAYBE_SEGLOCK(fs);
	if (vp2) {
		/* Make sure we're not in the process of reclaiming vp2 */
		mutex_enter(&lfs_lock);
		while (fs->lfs_flags & LFS_UNDIROP) {
			mtsleep(&fs->lfs_flags, PRIBIO + 1, "lfsrundirop", 0,
			    &lfs_lock);
		}
		mutex_exit(&lfs_lock);
	}

	KASSERT(fsb < 0 || VOP_ISLOCKED(vp));
	KASSERT(vp2 == NULL || fsb < 0 || VOP_ISLOCKED(vp2));
	KASSERT(vp2 == NULL || vp2 != fs->lfs_unlockvp);

#ifdef DIAGNOSTIC
	mutex_enter(&lfs_lock);
	if (fsb > 0)
		lfs_rescount++;
	else if (fsb < 0)
		lfs_rescount--;
	if (lfs_rescount < 0)
		panic("lfs_rescount");
	mutex_exit(&lfs_lock);
#endif

	error = lfs_reserveavail(fs, vp, vp2, fsb);
	if (error)
		return error;

	/*
	 * XXX just a guess. should be more precise.
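	 * The byte figure passed below is simply lfs_fsbtob(fs, fsb),
	 * i.e. the reserved block count converted to bytes.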
	 */
	error = lfs_reservebuf(fs, vp, vp2, fsb, lfs_fsbtob(fs, fsb));
	if (error)
		lfs_reserveavail(fs, vp, vp2, -fsb);

	return error;
}

int
lfs_max_bufs(void)
{

	return LFS_MAX_RESOURCE(buf_nbuf(), 1);
}

int
lfs_wait_bufs(void)
{

	return LFS_WAIT_RESOURCE(buf_nbuf(), 1);
}

int
lfs_bwrite(void *v)
{
	struct vop_bwrite_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap = v;
	struct buf *bp = ap->a_bp;

	KASSERTMSG((VTOI(bp->b_vp)->i_lfs->lfs_ronly ||
	    !(bp->b_flags & B_ASYNC)),
	    "bawrite LFS buffer");
	return lfs_bwrite_ext(bp, 0);
}

/*
 * Determine if there is enough room currently available to write fsb
 * blocks.  We need enough blocks for the new blocks, the current
 * inode blocks (including potentially the ifile inode), a summary block,
 * and the segment usage table, plus an ifile block.
 */
int
lfs_fits(struct lfs *fs, int fsb)
{
	int64_t needed;

	ASSERT_NO_SEGLOCK(fs);
	needed = fsb + lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
	    ((howmany(lfs_sb_getuinodes(fs) + 1, LFS_INOPB(fs)) +
	      lfs_sb_getsegtabsz(fs) +
	      1) << (lfs_sb_getbshift(fs) - lfs_sb_getffshift(fs)));

	if (needed >= lfs_sb_getavail(fs)) {
#ifdef DEBUG
		DLOG((DLOG_AVAIL, "lfs_fits: no fit: fsb = %ld, uinodes = %ld, "
		      "needed = %jd, avail = %jd\n",
		      (long)fsb, (long)lfs_sb_getuinodes(fs), (intmax_t)needed,
		      (intmax_t)lfs_sb_getavail(fs)));
#endif
		return 0;
	}
	return 1;
}

int
lfs_availwait(struct lfs *fs, int fsb)
{
	int error;
	CLEANERINFO *cip;
	struct buf *cbp;

	ASSERT_NO_SEGLOCK(fs);
	/* Push cleaner blocks through regardless */
	mutex_enter(&lfs_lock);
	if (LFS_SEGLOCK_HELD(fs) &&
	    fs->lfs_sp->seg_flags & (SEGM_CLEAN | SEGM_FORCE_CKP)) {
		mutex_exit(&lfs_lock);
		return 0;
	}
	mutex_exit(&lfs_lock);

	while (!lfs_fits(fs, fsb)) {
		/*
		 * Out of space, need cleaner to run.
		 * Update the cleaner info, then wake it up.
		 * Note the cleanerinfo block is on the ifile
		 * so it CANT_WAIT.
		 */
		LFS_CLEANERINFO(cip, fs, cbp);
		LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0);

#ifdef DEBUG
		DLOG((DLOG_AVAIL, "lfs_availwait: out of available space, "
		      "waiting on cleaner\n"));
#endif

		lfs_wakeup_cleaner(fs);
		KASSERTMSG(!LFS_SEGLOCK_HELD(fs), "lfs_availwait: deadlock");
		error = tsleep(&fs->lfs_availsleep, PCATCH | PUSER,
		    "cleaner", 0);
		if (error)
			return (error);
	}
	return 0;
}

int
lfs_bwrite_ext(struct buf *bp, int flags)
{
	struct lfs *fs;
	struct inode *ip;
	struct vnode *vp;
	int fsb;

	vp = bp->b_vp;
	fs = VFSTOULFS(vp->v_mount)->um_lfs;

	ASSERT_MAYBE_SEGLOCK(fs);
	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp));
	KASSERT((bp->b_flags & B_LOCKED) || !(bp->b_oflags & BO_DELWRI));

	/*
	 * Don't write *any* blocks if we're mounted read-only, or
	 * if we are "already unmounted".
	 *
	 * In particular the cleaner can't write blocks either.
	 */
	if (fs->lfs_ronly || (lfs_sb_getpflags(fs) & LFS_PF_CLEAN)) {
		bp->b_oflags &= ~BO_DELWRI;
		bp->b_flags |= B_READ;	/* XXX is this right? --ks */
		bp->b_error = 0;
		mutex_enter(&bufcache_lock);
		LFS_UNLOCK_BUF(bp);
		if (LFS_IS_MALLOC_BUF(bp))
			bp->b_cflags &= ~BC_BUSY;
		else
			brelsel(bp, 0);
		mutex_exit(&bufcache_lock);
		return (fs->lfs_ronly ? EROFS : 0);
	}

	/*
	 * Set the delayed write flag and use reassignbuf to move the buffer
	 * from the clean list to the dirty one.
	 *
	 * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
	 * the buffer onto the LOCKED free list.  This is necessary, otherwise
	 * getnewbuf() would try to reclaim the buffers using bawrite, which
	 * isn't going to work.
	 *
	 * XXX we don't let meta-data writes run out of space because they can
	 * come from the segment writer.  We need to make sure that there is
	 * enough space reserved so that there's room to write meta-data
	 * blocks.
	 */
	if ((bp->b_flags & B_LOCKED) == 0) {
		fsb = lfs_numfrags(fs, bp->b_bcount);

		ip = VTOI(vp);
		if (flags & BW_CLEAN) {
			lfs_setclean(fs, vp);
		} else {
			mutex_enter(&lfs_lock);
			LFS_SET_UINO(ip, IN_MODIFIED);
			mutex_exit(&lfs_lock);
		}
		if ((bp->b_oflags & BO_DELWRI) == 0)
			lfs_sb_subavail(fs, fsb);

		mutex_enter(&bufcache_lock);
		mutex_enter(vp->v_interlock);
		bp->b_oflags = (bp->b_oflags | BO_DELWRI) & ~BO_DONE;
		LFS_LOCK_BUF(bp);
		bp->b_flags &= ~B_READ;
		bp->b_error = 0;
		reassignbuf(bp, bp->b_vp);
		mutex_exit(vp->v_interlock);
	} else {
		mutex_enter(&bufcache_lock);
	}

	if (bp->b_iodone != NULL)
		bp->b_cflags &= ~BC_BUSY;
	else
		brelsel(bp, 0);
	mutex_exit(&bufcache_lock);

	return (0);
}

/*
 * Called and returns with the lfs_lock held.
 */
void
lfs_flush_fs(struct lfs *fs, int flags)
{
	ASSERT_NO_SEGLOCK(fs);
	KASSERT(mutex_owned(&lfs_lock));
	if (fs->lfs_ronly)
		return;

	if (lfs_dostats)
		++lfs_stats.flush_invoked;

	fs->lfs_pdflush = 0;
	mutex_exit(&lfs_lock);
	lfs_writer_enter(fs, "fldirop");
	lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
	lfs_writer_leave(fs);
	mutex_enter(&lfs_lock);
	fs->lfs_favail = 0;	/* XXX */
}

/*
 * This routine initiates segment writes when LFS is consuming too many
 * resources.  Ideally the pageout daemon would be able to direct LFS
 * more subtly.
 * XXX We have one static count of locked buffers;
 * XXX need to think more about the multiple filesystem case.
 *
 * Called and returns with lfs_lock held.
 * If fs != NULL, the caller must not hold the segment lock for fs
 * (see the KDASSERT below).
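 * If only_onefs is set, only fs itself is flushed; otherwise every other
 * mounted LFS is flushed as well via the mountlist walk below.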
 */
void
lfs_flush(struct lfs *fs, int flags, int only_onefs)
{
	extern u_int64_t locked_fakequeue_count;
	mount_iterator_t *iter;
	struct mount *mp;
	struct lfs *tfs;

	KASSERT(mutex_owned(&lfs_lock));
	KDASSERT(fs == NULL || !LFS_SEGLOCK_HELD(fs));
	KASSERT(!(fs == NULL && only_onefs));

	if (lfs_dostats)
		++lfs_stats.write_exceeded;
	if (lfs_writing && !(flags & (SEGM_SYNC|SEGM_CKP))) {
		DLOG((DLOG_FLUSH, "lfs_flush: not flushing because another flush is active\n"));
		return;
	}
	while (lfs_writing)
		cv_wait(&lfs_writing_cv, &lfs_lock);
	lfs_writing = 1;

	mutex_exit(&lfs_lock);

	if (fs != NULL) {
		if (!(fs->lfs_flags & LFS_NOTYET)
		    && vfs_busy(fs->lfs_ivnode->v_mount))
			goto errout;
		mutex_enter(&lfs_lock);
		lfs_flush_fs(fs, flags);
		mutex_exit(&lfs_lock);
		if (!(fs->lfs_flags & LFS_NOTYET))
			vfs_unbusy(fs->lfs_ivnode->v_mount);
	}
	if (!only_onefs) {
		locked_fakequeue_count = 0;
		mountlist_iterator_init(&iter);
		while ((mp = mountlist_iterator_next(iter)) != NULL) {
			if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS,
			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
				tfs = VFSTOULFS(mp)->um_lfs;
				if (tfs == fs)
					continue;
				mutex_enter(&lfs_lock);
				lfs_flush_fs(tfs, flags);
				mutex_exit(&lfs_lock);
			}
		}
		mountlist_iterator_destroy(iter);
	}
	wakeup(&lfs_subsys_pages);

 errout:
	mutex_enter(&lfs_lock);
	KASSERT(lfs_writing);
	lfs_writing = 0;
	wakeup(&lfs_writing);
}

#define INOCOUNT(fs) howmany(lfs_sb_getuinodes(fs), LFS_INOPB(fs))
#define INOBYTES(fs) (lfs_sb_getuinodes(fs) * DINOSIZE(fs))

/*
 * Determine whether this filesystem, or the global state,
 * needs to flush.
 */
int
lfs_needsflush(struct lfs *fs)
{
	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS)
		return 1;
	if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
		return 1;
	if (lfs_subsys_pages > LFS_MAX_PAGES)
		return 1;
	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs))
		return 1;
	if (lfs_dirvcount > LFS_MAX_DIROP)
		return 1;
	if (fs->lfs_diropwait > 0)
		return 1;

	return 0;
}

/*
 * As above, but needs to *wait*.
 */
int
lfs_needswait(struct lfs *fs)
{
	if (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS)
		return 1;
	if (locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES)
		return 1;
	if (lfs_subsys_pages > LFS_WAIT_PAGES)
		return 1;
	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs))	/* XXX */
		return 1;
	if (lfs_dirvcount > LFS_MAX_DIROP)	/* XXX */
		return 1;
	if (fs->lfs_diropwait > 0)
		return 1;

	return 0;
}


/*
 * Make sure that we don't have too many locked buffers.
 * Flush buffers if needed.
 */
int
lfs_check(struct vnode *vp, daddr_t blkno, int flags)
{
	int error;
	struct lfs *fs;
	struct inode *ip;
	extern kcondvar_t lfs_writerd_cv;

	error = 0;
	ip = VTOI(vp);

	/* If out of buffers, wait on writer */
	/* XXX KS - if it's the Ifile, we're probably the cleaner! */
	if (ip->i_number == LFS_IFILE_INUM)
		return 0;
	/* If we're being called from inside a dirop, don't sleep */
	if (ip->i_state & IN_ADIROP)
		return 0;

	fs = ip->i_lfs;

	ASSERT_NO_SEGLOCK(fs);

	/*
	 * If we would flush below, but dirops are active, sleep.
	 * Note that a dirop cannot ever reach this code!
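	 * (Dirops return early above, at the IN_ADIROP check.)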
	 */
	mutex_enter(&lfs_lock);
	while (fs->lfs_dirops > 0 && lfs_needswait(fs)) {
		++fs->lfs_diropwait;
		mtsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0,
		    &lfs_lock);
		--fs->lfs_diropwait;
	}

#ifdef DEBUG
	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS)
		DLOG((DLOG_FLUSH, "lfs_check: lqc = %d, max %d\n",
		      locked_queue_count + INOCOUNT(fs), LFS_MAX_BUFS));
	if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
		DLOG((DLOG_FLUSH, "lfs_check: lqb = %ld, max %ld\n",
		      locked_queue_bytes + INOBYTES(fs), LFS_MAX_BYTES));
	if (lfs_subsys_pages > LFS_MAX_PAGES)
		DLOG((DLOG_FLUSH, "lfs_check: lssp = %d, max %d\n",
		      lfs_subsys_pages, LFS_MAX_PAGES));
	if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip)
		DLOG((DLOG_FLUSH, "lfs_check: fssp = %d, trip at %d\n",
		      fs->lfs_pages, lfs_fs_pagetrip));
	if (lfs_dirvcount > LFS_MAX_DIROP)
		DLOG((DLOG_FLUSH, "lfs_check: ldvc = %d, max %d\n",
		      lfs_dirvcount, LFS_MAX_DIROP));
	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs))
		DLOG((DLOG_FLUSH, "lfs_check: lfdvc = %d, max %d\n",
		      fs->lfs_dirvcount, LFS_MAX_FSDIROP(fs)));
	if (fs->lfs_diropwait > 0)
		DLOG((DLOG_FLUSH, "lfs_check: ldvw = %d\n",
		      fs->lfs_diropwait));
#endif

	/* If there are too many pending dirops, we have to flush them. */
	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
	    lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
		KASSERT(fs->lfs_dirops == 0);
		fs->lfs_writer++;
		mutex_exit(&lfs_lock);
		lfs_flush_dirops(fs);
		mutex_enter(&lfs_lock);
		if (--fs->lfs_writer == 0)
			cv_broadcast(&fs->lfs_diropscv);
		KASSERT(fs->lfs_dirops == 0);
	} else if (lfs_needsflush(fs)) {
		lfs_flush(fs, flags, 0);
	} else if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) {
		/*
		 * If we didn't flush the whole thing, some filesystems
		 * still might want to be flushed.
		 */
		++fs->lfs_pdflush;
		cv_broadcast(&lfs_writerd_cv);
	}

	while (lfs_needswait(fs)) {
		if (lfs_dostats)
			++lfs_stats.wait_exceeded;
		DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
		      locked_queue_count, locked_queue_bytes));
		++locked_queue_waiters;
		cv_broadcast(&lfs_writerd_cv);
		error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
		    hz * LFS_BUFWAIT);
		--locked_queue_waiters;
		if (error != EWOULDBLOCK)
			break;

		/*
		 * lfs_flush might not flush all the buffers, if some of the
		 * inodes were locked or if most of them were Ifile blocks
		 * and we weren't asked to checkpoint.  Try flushing again
		 * after waiting, to keep us from blocking indefinitely.
		 */
		if (locked_queue_count + INOCOUNT(fs) >= LFS_MAX_BUFS ||
		    locked_queue_bytes + INOBYTES(fs) >= LFS_MAX_BYTES) {
			lfs_flush(fs, flags | SEGM_CKP, 0);
		}
	}
	mutex_exit(&lfs_lock);
	return (error);
}

/*
 * Allocate a new buffer header.
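 * The data area, if any, is allocated with lfs_malloc(); the iodone
 * handler is set to lfs_free_aiodone, and lfs_freebuf() below provides
 * the matching release path.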
 */
struct buf *
lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type)
{
	struct buf *bp;
	size_t nbytes;

	ASSERT_MAYBE_SEGLOCK(fs);
	nbytes = roundup(size, lfs_fsbtob(fs, 1));

	bp = getiobuf(NULL, true);
	if (nbytes) {
		bp->b_data = lfs_malloc(fs, nbytes, type);
		/* memset(bp->b_data, 0, nbytes); */
	}
	KASSERT(vp != NULL);
	KASSERT(bp != NULL);

	bp->b_bufsize = size;
	bp->b_bcount = size;
	bp->b_lblkno = daddr;
	bp->b_blkno = daddr;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_iodone = lfs_free_aiodone;
	bp->b_cflags |= BC_BUSY | BC_NOCACHE;
	bp->b_private = fs;

	mutex_enter(&bufcache_lock);
	mutex_enter(vp->v_interlock);
	bgetvp(vp, bp);
	mutex_exit(vp->v_interlock);
	mutex_exit(&bufcache_lock);

	return (bp);
}

void
lfs_freebuf(struct lfs *fs, struct buf *bp)
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		mutex_enter(&bufcache_lock);
		mutex_enter(vp->v_interlock);
		brelvp(bp);
		mutex_exit(vp->v_interlock);
		mutex_exit(&bufcache_lock);
	}
	if (!(bp->b_cflags & BC_INVAL)) { /* BC_INVAL indicates a "fake" buffer */
		lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN);
		bp->b_data = NULL;
	}
	putiobuf(bp);
}

int
lfs_wait_pages(void)
{
	int active, inactive;

	uvm_estimatepageable(&active, &inactive);
	return LFS_WAIT_RESOURCE(active + inactive + uvm_availmem(false), 1);
}

int
lfs_max_pages(void)
{
	int active, inactive;

	uvm_estimatepageable(&active, &inactive);
	return LFS_MAX_RESOURCE(active + inactive + uvm_availmem(false), 1);
}