/*	$NetBSD: lfs_segment.c,v 1.308 2026/01/30 15:52:54 perseant Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_segment.c	8.10 (Berkeley) 6/10/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.308 2026/01/30 15:52:54 perseant Exp $");

#ifdef DEBUG
# define vndebug(vp, str) do {						\
	if (VTOI(vp)->i_state & IN_CLEANING)				\
		DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \
		     VTOI(vp)->i_number, (str), op));			\
} while(0)
#else
# define vndebug(vp, str)
#endif
#define ivndebug(vp, str) \
	DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str)))

#if defined(_KERNEL_OPT)
#include "opt_ddb.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/vnode_impl.h>
#include <sys/workqueue.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_page.h>

MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");

static void lfs_super_aiodone(struct buf *);
static void lfs_cluster_aiodone(struct buf *);

/*
 * Determine if it's OK to start a partial in this segment, or if we need
 * to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	(lfs_sb_getfsbpseg(fs) - \
	    (lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs)) > \
	    lfs_sb_getfrag(fs))

/*
 * Figure out whether we should do a checkpoint write or go ahead with
 * an ordinary write.
 */
#define LFS_SHOULD_CHECKPOINT(fs, flags) \
	((flags & SEGM_CLEAN) == 0 &&					\
	  ((fs->lfs_nactive > LFS_MAX_ACTIVE ||				\
	    (flags & SEGM_CKP) ||					\
	    lfs_sb_getnclean(fs) < LFS_MAX_ACTIVE)))

int	 lfs_match_fake(struct lfs *, struct buf *);
void	 lfs_newseg(struct lfs *);
void	 lfs_updatemeta(struct segment *);
void	 lfs_writesuper(struct lfs *, daddr_t);
int	 lfs_writevnodes(struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops);

static void lfs_shellsort(struct lfs *, struct buf **, union lfs_blocks *,
			  int, int);

kcondvar_t	lfs_allclean_wakeup;	/* Cleaner wakeup address. */
int	lfs_writeindir = 1;		/* whether to flush indir on non-ckp */
int	lfs_clean_vnhead = 0;		/* Allow freeing to head of vn list */
int	lfs_dirvcount = 0;		/* # active dirops */

extern struct workqueue *lfs_cluster_wq;
extern struct workqueue *lfs_super_wq;

/* Statistics Counters */
int lfs_dostats = 1;
struct lfs_stats lfs_stats;

/* op values to lfs_writevnodes */
#define	VN_REG		0
#define	VN_DIROP	1
#define	VN_EMPTY	2
#define	VN_CLEAN	3
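
/*
 * These op values select which vnodes lfs_writevnodes() considers:
 * VN_REG writes ordinary (non-dirop) vnodes, VN_DIROP writes vnodes
 * involved in directory operations, VN_EMPTY considers only vnodes
 * with no dirty buffers (used by lfs_vflush), and VN_CLEAN writes
 * vnodes being cleaned (see lfs_writevnodes_selector() below).
 */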

/*
 * XXX KS - Set modification time on the Ifile, so the cleaner can
 * read the fs mod time off of it.  We don't set IN_UPDATE here,
 * since we don't really need this to be flushed to disk (and in any
 * case that wouldn't happen to the Ifile until we checkpoint).
 */
void
lfs_imtime(struct lfs *fs)
{
	struct timespec ts;
	struct inode *ip;

	ASSERT_MAYBE_SEGLOCK(fs);
	vfs_timestamp(&ts);
	ip = VTOI(fs->lfs_ivnode);
	lfs_dino_setmtime(fs, ip->i_din, ts.tv_sec);
	lfs_dino_setmtimensec(fs, ip->i_din, ts.tv_nsec);
}

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

#define IS_FLUSHING(fs,vp)  ((fs)->lfs_flushvp == (vp))

int
lfs_vflush(struct vnode *vp)
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct buf *bp, *nbp;
	int error;
	int flushed;
	int relock;

	ip = VTOI(vp);
	fs = VFSTOULFS(vp->v_mount)->um_lfs;
	relock = 0;

top:
	ASSERT_NO_SEGLOCK(fs);
	KASSERT(mutex_owned(vp->v_interlock) == false);
	KASSERT(mutex_owned(&lfs_lock) == false);
	KASSERT(mutex_owned(&bufcache_lock) == false);
	KASSERT(!(ip->i_state & IN_CLEANING));

	lfs_writer_enter(fs, "vfwriter");

	mutex_enter(vp->v_interlock);

	/* If the node is being written, wait until that is done */
	while (WRITEINPROG(vp)) {
		ivndebug(vp,"vflush/writeinprog");
		cv_wait(&vp->v_cv, vp->v_interlock);
	}
	error = vdead_check(vp, VDEAD_NOWAIT);
	mutex_exit(vp->v_interlock);

	/* Protect against deadlock in vinvalbuf() */
	lfs_seglock(fs, SEGM_SYNC | ((error != 0) ? SEGM_RECLAIM : 0));
	if (error != 0) {
		fs->lfs_reclino = ip->i_number;
		error = 0;
	}

	KASSERT(!(ip->i_state & IN_CLEANING));

	/* If we're supposed to flush a freed inode, just toss it */
	if (ip->i_lfs_iflags & LFSI_DELETED) {
		DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n",
		      ip->i_number));
		/* Drain v_numoutput */
		mutex_enter(vp->v_interlock);
		while (vp->v_numoutput > 0) {
			cv_wait(&vp->v_cv, vp->v_interlock);
		}
		KASSERT(vp->v_numoutput == 0);
		mutex_exit(vp->v_interlock);

		mutex_enter(&bufcache_lock);
		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = LIST_NEXT(bp, b_vnbufs);

			KASSERT((bp->b_flags & B_GATHERED) == 0);
			if (bp->b_oflags & BO_DELWRI) { /* XXX always true? */
				lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
				wakeup(&fs->lfs_availsleep);
			}
			/* Copied from lfs_writeseg */
			if (bp->b_iodone != NULL) {
				mutex_exit(&bufcache_lock);
				biodone(bp);
				mutex_enter(&bufcache_lock);
			} else {
				bremfree(bp);
				LFS_UNLOCK_BUF(bp);
				mutex_enter(vp->v_interlock);
				bp->b_flags &= ~(B_READ | B_GATHERED);
				bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE;
				bp->b_error = 0;
				reassignbuf(bp, vp);
				mutex_exit(vp->v_interlock);
				brelse(bp, 0);
			}
		}
		mutex_exit(&bufcache_lock);
		LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED);
		ip->i_state &= ~IN_ALLMOD;
		DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n",
		      ip->i_number));
		lfs_segunlock(fs);

		KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);

		goto out;
	}

	fs->lfs_flushvp = vp;
	if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
		error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC);
		fs->lfs_flushvp = NULL;
		KASSERT(fs->lfs_flushvp_fakevref == 0);
		lfs_segunlock(fs);

		/* Make sure that any pending buffers get written */
		mutex_enter(vp->v_interlock);
		while (vp->v_numoutput > 0) {
			cv_wait(&vp->v_cv, vp->v_interlock);
		}
		KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
		KASSERT(vp->v_numoutput == 0);
		mutex_exit(vp->v_interlock);

		goto out;
	}
	sp = fs->lfs_sp;

	flushed = 0;
	if (VPISEMPTY(vp)) {
		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
		++flushed;
	} else if (lfs_dostats) {
		if (!VPISEMPTY(vp) || (VTOI(vp)->i_state & IN_ALLMOD))
			++lfs_stats.vflush_invoked;
		ivndebug(vp,"vflush");
	}

#ifdef DIAGNOSTIC
	if (vp->v_uflag & VU_DIROP) {
		DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n"));
		/* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */
	}
#endif

	do {
#ifdef DEBUG
		int loopcount = 0;
#endif
		do {
			if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
				relock = lfs_writefile(fs, sp, vp);
				if (relock && vp != fs->lfs_ivnode) {
					/*
					 * Might have to wait for the
					 * cleaner to run; but we're
					 * still not done with this vnode.
					 * XXX we can do better than this.
					 */
					KASSERT(ip->i_number != LFS_IFILE_INUM);
					lfs_writeinode(fs, sp, ip);
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_MODIFIED);
					mutex_exit(&lfs_lock);
					lfs_writeseg(fs, sp);
					lfs_segunlock(fs);
					lfs_segunlock_relock(fs);
					goto top;
				}
			}
			/*
			 * If we begin a new segment in the middle of writing
			 * the Ifile, it creates an inconsistent checkpoint,
			 * since the Ifile information for the new segment
			 * is not up-to-date.  Take care of this here by
			 * sending the Ifile through again in case there
			 * are newly dirtied blocks.  But wait, there's more!
			 * This second Ifile write could *also* cross a segment
			 * boundary, if the first one was large.  The second
			 * one is guaranteed to be no more than 8 blocks,
			 * though (two segment blocks and supporting indirects)
			 * so the third write *will not* cross the boundary.
			 */
			if (vp == fs->lfs_ivnode) {
				lfs_writefile(fs, sp, vp);
				lfs_writefile(fs, sp, vp);
			}
#ifdef DEBUG
			if (++loopcount > 2)
				log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount);
#endif
		} while (lfs_writeinode(fs, sp, ip));
	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

	if (lfs_dostats) {
		++lfs_stats.nwrites;
		if (sp->seg_flags & SEGM_SYNC)
			++lfs_stats.nsync_writes;
		if (sp->seg_flags & SEGM_CKP)
			++lfs_stats.ncheckpoints;
	}
	/*
	 * If we were called from somewhere that has already held the seglock
	 * (e.g., lfs_markv()), the lfs_segunlock will not wait for
	 * the write to complete because we are still locked.
	 * Since lfs_vflush() must return the vnode with no dirty buffers,
	 * we must explicitly wait, if that is the case.
	 *
	 * We compare the iocount against 1, not 0, because it is
	 * artificially incremented by lfs_seglock().
	 */
	mutex_enter(&lfs_lock);
	if (fs->lfs_seglock > 1) {
		while (fs->lfs_iocount > 1)
			(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
				     "lfs_vflush", 0, &lfs_lock);
	}
	mutex_exit(&lfs_lock);

	lfs_segunlock(fs);

	/* Wait for these buffers to be recovered by aiodoned */
	mutex_enter(vp->v_interlock);
	while (vp->v_numoutput > 0) {
		cv_wait(&vp->v_cv, vp->v_interlock);
	}
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	KASSERT(vp->v_numoutput == 0);
	mutex_exit(vp->v_interlock);

	fs->lfs_flushvp = NULL;
	KASSERT(fs->lfs_flushvp_fakevref == 0);

out:
	lfs_writer_leave(fs);
	return error;
}

struct lfs_writevnodes_ctx {
	int op;
	struct lfs *fs;
};
static bool
lfs_writevnodes_selector(void *cl, struct vnode *vp)
{
	struct lfs_writevnodes_ctx *c = cl;
	struct inode *ip;
	int op = c->op;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(vp->v_interlock));

	/*
	 * A vnode being reclaimed will be in state VS_RECLAIMING
	 * while it attempts to get the segment lock.  We hold the
	 * segment lock, so we must skip these vnodes in order to
	 * avoid a deadlock.
	 */
	if (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED
	    && !IS_FLUSHING(c->fs, vp))
		return false;

	ip = VTOI(vp);
	if (ip == NULL || vp->v_type == VNON || ip->i_nlink <= 0)
		return false;
	if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) ||
	    (op != VN_DIROP && op != VN_CLEAN && (vp->v_uflag & VU_DIROP))) {
		vndebug(vp, "dirop");
		return false;
	}
	if (op == VN_EMPTY && !VPISEMPTY(vp)) {
		vndebug(vp,"empty");
		return false;
	}
	if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM &&
	    vp != c->fs->lfs_flushvp && !(ip->i_state & IN_CLEANING)) {
		vndebug(vp,"cleaning");
		return false;
	}
	mutex_enter(&lfs_lock);
	if (vp == c->fs->lfs_unlockvp) {
		mutex_exit(&lfs_lock);
		return false;
	}
	mutex_exit(&lfs_lock);

	return true;
}
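
/*
 * Write all vnodes on the given mount point that are selected by the
 * given op value (see lfs_writevnodes_selector() above).  Return 0 on
 * success, or the error from lfs_writefile(); notably EAGAIN, which
 * tells the caller to drop the segment lock and try again after the
 * cleaner has had a chance to run.
 */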
int
lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
{
	struct inode *ip;
	struct vnode *vp;
	struct vnode_iterator *marker;
	struct lfs_writevnodes_ctx ctx;
	int inodes_written = 0;
	int error = 0;

	/*
	 * XXX This was TAILQ_FOREACH_REVERSE on &mp->mnt_vnodelist.
	 * XXX The rationale is unclear, the initial commit had no information.
	 * XXX If the order really matters we have to sort the vnodes first.
	 */

	ASSERT_SEGLOCK(fs);
	vfs_vnode_iterator_init(mp, &marker);
	ctx.op = op;
	ctx.fs = fs;
	while ((vp = vfs_vnode_iterator_next(marker,
	    lfs_writevnodes_selector, &ctx)) != NULL) {
		ip = VTOI(vp);

		/*
		 * Write the inode/file if dirty and it's not the IFILE.
		 */
		if (((ip->i_state & IN_ALLMOD) || !VPISEMPTY(vp)) &&
		    ip->i_number != LFS_IFILE_INUM) {
			error = lfs_writefile(fs, sp, vp);
			if (error) {
				vrele(vp);
				if (error == EAGAIN) {
					/*
					 * This error from lfs_putpages
					 * indicates we need to drop
					 * the segment lock and start
					 * over after the cleaner has
					 * had a chance to run.
					 */
					lfs_writeinode(fs, sp, ip);
					lfs_writeseg(fs, sp);
					if (!VPISEMPTY(vp) &&
					    !WRITEINPROG(vp) &&
					    !(ip->i_state & IN_ALLMOD)) {
						mutex_enter(&lfs_lock);
						LFS_SET_UINO(ip, IN_MODIFIED);
						mutex_exit(&lfs_lock);
					}
					break;
				}
				error = 0; /* XXX not quite right */
				continue;
			}

			if (!VPISEMPTY(vp)) {
				if (WRITEINPROG(vp)) {
					ivndebug(vp,"writevnodes/write2");
				} else if (!(ip->i_state & IN_ALLMOD)) {
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_MODIFIED);
					mutex_exit(&lfs_lock);
				}
			}
			(void) lfs_writeinode(fs, sp, ip);
			inodes_written++;
		}
		vrele(vp);
	}
	vfs_vnode_iterator_destroy(marker);
	return error;
}

/*
 * Do a segment write, upgrading it to a checkpoint if requested or
 * required (see LFS_SHOULD_CHECKPOINT above).
 */
int
lfs_segwrite(struct mount *mp, int flags)
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	int do_ckp, did_ckp, error;
	unsigned n, segleft, maxseg, sn, i, curseg;
	int writer_set = 0;
	int dirty;
	int redo;
	SEGSUM *ssp;
	int um_error;

	fs = VFSTOULFS(mp)->um_lfs;
	DLOG((DLOG_SEG, "lfs_segwrite(fs=%p, flags=%x)\n", fs, flags));
	ASSERT_MAYBE_SEGLOCK(fs);

	if (fs->lfs_ronly)
		return EROFS;

	lfs_imtime(fs);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);

	/*
	 * If we know we're gonna need the writer lock, take it now to
	 * preserve the lock order lfs_writer -> lfs_seglock.
	 */
	if (do_ckp && !LFS_SEGLOCK_HELD(fs)) {
		lfs_writer_enter(fs, "ckpwriter");
		writer_set = 1;
	}

	/* We can't do a partial write and checkpoint at the same time. */
	if (do_ckp)
		flags &= ~SEGM_SINGLE;

	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
	sp = fs->lfs_sp;
	if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
		do_ckp = 1;

	/*
	 * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
	 * in which case we have to flush *all* buffers off of this vnode.
	 * We don't care about other nodes, but write any non-dirop nodes
	 * anyway in anticipation of another getnewvnode().
	 *
	 * If we're cleaning we only write cleaning and ifile blocks, and
	 * no dirops, since otherwise we'd risk corruption in a crash.
	 */
	DLOG((DLOG_SEG, " do_ckp=%d sp->seg_flags=0x%x\n", do_ckp, sp->seg_flags));
	if (sp->seg_flags & SEGM_CLEAN)
		lfs_writevnodes(fs, mp, sp, VN_CLEAN);
	else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
		do {
			um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
			if ((sp->seg_flags & SEGM_SINGLE) &&
			    lfs_sb_getcurseg(fs) != fs->lfs_startseg) {
				DLOG((DLOG_SEG, "lfs_segwrite: breaking out of segment write at daddr 0x%jx\n", (uintmax_t)lfs_sb_getoffset(fs)));
				break;
			}

			if (do_ckp ||
			    (writer_set = lfs_writer_tryenter(fs)) != 0) {
				KASSERT(fs->lfs_writer);
				error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
				if (um_error == 0)
					um_error = error;
				/*
				 * In case writevnodes errored out
				 * XXX why are we always doing this and not
				 * just on error?
				 */
				lfs_flush_dirops(fs);
				ssp = (SEGSUM *)(sp->segsum);
				lfs_ss_setflags(fs, ssp,
				    lfs_ss_getflags(fs, ssp) & ~(SS_CONT));
				lfs_finalize_fs_seguse(fs);
			}
			if (do_ckp && um_error) {
				lfs_segunlock_relock(fs);
				sp = fs->lfs_sp;
			}
		} while (do_ckp && um_error != 0);
	}

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.
	 */
	if (do_ckp || fs->lfs_doifile) {
		segleft = lfs_sb_getnseg(fs);
		curseg = 0;
		for (n = 0; n < lfs_sb_getsegtabsz(fs); n++) {
			int bread_error;

			dirty = 0;
			bread_error = bread(fs->lfs_ivnode,
			    lfs_sb_getcleansz(fs) + n,
			    lfs_sb_getbsize(fs), B_MODIFY, &bp);
			if (bread_error)
				panic("lfs_segwrite: ifile read: "
				      "seguse %u: error %d\n",
				      n, bread_error);
			segusep = (SEGUSE *)bp->b_data;
			maxseg = uimin(segleft, lfs_sb_getsepb(fs));
			for (i = 0; i < maxseg; i++) {
				sn = curseg + i;
				if (sn != lfs_dtosn(fs, lfs_sb_getcurseg(fs)) &&
				    segusep->su_flags & SEGUSE_ACTIVE) {
					segusep->su_flags &= ~SEGUSE_ACTIVE;
					--fs->lfs_nactive;
					++dirty;
				}
				if (lfs_sb_getversion(fs) > 1)
					++segusep;
				else
					segusep = (SEGUSE *)
						((SEGUSE_V1 *)segusep + 1);
			}

			if (dirty)
				error = LFS_BWRITE_LOG(bp); /* Ifile */
			else
				brelse(bp, 0);
			segleft -= lfs_sb_getsepb(fs);
			curseg += lfs_sb_getsepb(fs);
		}
	}

	KASSERT(LFS_SEGLOCK_HELD(fs));

	did_ckp = 0;
	if (do_ckp || fs->lfs_doifile) {
		vp = fs->lfs_ivnode;
		ip = VTOI(vp);
#ifdef DEBUG
		int loopcount = 0;
#endif
		do {
			redo = 0;

			LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid);

			mutex_enter(&lfs_lock);
			/* Track changes to the ifile */
			fs->lfs_flags &= ~LFS_IFDIRTY;
			mutex_exit(&lfs_lock);

			mutex_enter(vp->v_interlock);
			if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
				/*
				 * Ifile has no pages, so we don't need
				 * to check error return here.
				 */
				mutex_exit(vp->v_interlock);
				lfs_writefile(fs, sp, vp);
				mutex_enter(vp->v_interlock);
			}

			/*
			 * It is possible for writes to the Ifile to
			 * cause other sections of the Ifile to be
			 * dirtied.  If that is the case, try again.
			 */
top0:
			LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
				if (!(bp->b_oflags & BO_DELWRI)) {
					goto top0;
				}
				if (!(bp->b_flags & B_GATHERED)) {
					redo = 1;
					break;
				}
			}
			mutex_exit(vp->v_interlock);

			if (ip->i_state & IN_ALLMOD)
				++did_ckp;
			redo += lfs_writeinode(fs, sp, ip);
			redo += lfs_writeseg(fs, sp);
			mutex_enter(&lfs_lock);
			redo += (fs->lfs_flags & LFS_IFDIRTY);
			mutex_exit(&lfs_lock);
#ifdef DEBUG
			if (++loopcount > 2)
				log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n",
				    loopcount);
#endif
		} while (redo && do_ckp);

		/*
		 * Unless we are unmounting, the Ifile may continue to have
		 * dirty blocks even after a checkpoint, due to changes to
		 * inodes' atime.  If we're checkpointing, it's "impossible"
		 * for other parts of the Ifile to be dirty after the loop
		 * above, since we hold the segment lock.
		 */
		mutex_enter(vp->v_interlock);
		if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
			LFS_CLR_UINO(ip, (IN_ALLMOD & ~IN_CLEANING));
		}
#ifdef DIAGNOSTIC
		else if (do_ckp) {
			int do_panic = 0;
top:
			LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
				if (!(bp->b_oflags & BO_DELWRI))
					goto top;
				if (bp->b_lblkno < lfs_sb_getcleansz(fs) +
				    lfs_sb_getsegtabsz(fs) &&
				    !(bp->b_flags & B_GATHERED)) {
					printf("ifile lbn %ld still dirty"
					       " (flags 0x%lx cflags 0x%lx"
					       " oflags 0x%lx)\n",
					       (long)bp->b_lblkno,
					       (long)bp->b_flags,
					       (long)bp->b_cflags,
					       (long)bp->b_oflags);
					++do_panic;
				}
			}
			if (do_panic)
				panic("dirty blocks");
		}
#endif
		mutex_exit(vp->v_interlock);
	} else {
		(void) lfs_writeseg(fs, sp);
	}

	/* Note Ifile no longer needs to be written */
	fs->lfs_doifile = 0;
	if (writer_set)
		lfs_writer_leave(fs);

	/*
	 * If we didn't write the Ifile, we didn't really do anything.
	 * That means that (1) there is a checkpoint on disk and (2)
	 * nothing has changed since it was written.
	 *
	 * Take the flags off of the segment so that lfs_segunlock
	 * doesn't have to write the superblock either.
	 */
	if (do_ckp && !did_ckp) {
		sp->seg_flags &= ~SEGM_CKP;
	}

	if (lfs_dostats) {
		++lfs_stats.nwrites;
		if (sp->seg_flags & SEGM_SYNC)
			++lfs_stats.nsync_writes;
		if (sp->seg_flags & SEGM_CKP)
			++lfs_stats.ncheckpoints;
	}
	lfs_segunlock(fs);

	DLOG((DLOG_SEG, " returning 0\n"));
	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
int
lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
{
	struct inode *ip;
	int i, frag;
	SEGSUM *ssp;
	int error;
	bool ifile, rolling, checkpointing, cleaning;

	ASSERT_SEGLOCK(fs);
	error = 0;
	ip = VTOI(vp);

	ifile = (vp == fs->lfs_ivnode);
	rolling = (fs->lfs_flags & LFS_NOTYET)
		&& curproc->p_pid == fs->lfs_rfpid;
	checkpointing = (sp->seg_flags & SEGM_CKP);
	cleaning = (sp->seg_flags & SEGM_CLEAN);

	lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);

	if (vp->v_uflag & VU_DIROP) {
		ssp = (SEGSUM *)sp->segsum;
		lfs_ss_setflags(fs, ssp,
		    lfs_ss_getflags(fs, ssp) | (SS_DIROP|SS_CONT));
	}

	if (ifile) {
		/*
		 * Whenever we write the ifile, we write all of its blocks.
		 */
		lfs_gather(fs, sp, vp, lfs_match_data);
	} else if (rolling) {
		/*
		 * When rolling forward, we never write direct blocks;
		 * these are already on disk.  Call lfs_ungather to
		 * dispose of them.
		 */
		lfs_ungather(fs, sp, vp, lfs_match_data);
	} else if (IS_FLUSHING(fs, vp)) {
		/*
		 * For a file being flushed, we need to write *all* blocks.
		 * This means writing any cleaning blocks first, and then
		 * immediately following with non-cleaning blocks.
		 *
		 * We don't need to call VOP_PUTPAGES here as it has already
		 * been done for us by the caller.
		 */
		if (cleaning)
			lfs_gather(fs, sp, vp, lfs_match_fake);
		lfs_gather(fs, sp, vp, lfs_match_data);
	} else if (cleaning) {
		/*
		 * If we are cleaning, only write fake blocks.
		 */
		lfs_gather(fs, sp, vp, lfs_match_fake);
	} else {
		/*
		 * The normal case.  Write everything we've got.
		 */
		lfs_gather(fs, sp, vp, lfs_match_data);
		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
		error = VOP_PUTPAGES(vp, 0, 0,
		    PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
	}

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 *
	 * We have to write them anyway, though, under two conditions: (1) the
	 * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
	 * checkpointing.
	 *
	 * BUT if we are cleaning, we might have indirect blocks that refer to
	 * new blocks not being written yet, in addition to fragments being
	 * moved out of a cleaned segment.  If that is the case, don't
	 * write the indirect blocks, or the finfo will have a small block
	 * in the middle of it!
	 * XXX in this case isn't the inode size wrong too?
	 */
	frag = 0;
	if (cleaning) {
		for (i = 0; i < ULFS_NDADDR; i++)
			if (ip->i_lfs_fragsize[i] > 0 &&
			    ip->i_lfs_fragsize[i] < lfs_sb_getbsize(fs))
				++frag;
	}
	KASSERTMSG((frag <= 1),
	    "lfs_writefile: more than one fragment! frag=%d", frag);
	if (IS_FLUSHING(fs, vp) || ifile ||
	    (frag == 0 && (lfs_writeindir || checkpointing))) {
		lfs_gather(fs, sp, vp, lfs_match_indir);
		lfs_gather(fs, sp, vp, lfs_match_dindir);
		lfs_gather(fs, sp, vp, lfs_match_tindir);
	}
	lfs_release_finfo(fs);

	return error;
}

/*
 * Update segment accounting to reflect this inode's change of address.
 */
void
lfs_update_iaddr(struct lfs *fs, struct inode *ip, daddr_t ndaddr)
{
	struct buf *bp;
	IFILE *ifp;
	SEGUSE *sup;
	ino_t ino;
	int oldsn, newsn;
	daddr_t odaddr;

	ASSERT_SEGLOCK(fs);

	ino = ip->i_number;
	newsn = (ndaddr == LFS_UNUSED_DADDR ? -1 : lfs_dtosn(fs, ndaddr));

	/* Update recorded location, noting old location */
	if (ino == LFS_IFILE_INUM) {
		odaddr = lfs_sb_getidaddr(fs);
		oldsn = lfs_dtosn(fs, odaddr);
		lfs_sb_setidaddr(fs, ndaddr);
	} else {
		LFS_IENTRY(ifp, fs, ino, bp);
		odaddr = lfs_if_getdaddr(fs, ifp);
		oldsn = (odaddr == LFS_UNUSED_DADDR ? -1
			 : lfs_dtosn(fs, odaddr));
		lfs_if_setdaddr(fs, ifp, ndaddr);
		LFS_WRITEIENTRY(ifp, fs, ino, bp);
	}

	/*
	 * If moving the inode to another block in the same segment,
	 * there is nothing more to do.
	 */
	if (oldsn == newsn)
		return;

	/* Remove from its old segment, if any */
	if (oldsn >= 0) {
		LFS_SEGENTRY(sup, fs, oldsn, bp);
		DLOG((DLOG_SU, "seg %jd -= %jd for ino %jd inode\n",
		      (intmax_t)oldsn,
		      (intmax_t)DINOSIZE(fs),
		      (intmax_t)ino));
		KASSERTMSG(sup->su_nbytes >= DINOSIZE(fs),
		    "lfs_writeinode: negative bytes "
		    "(segment %jd short by %jd)",
		    (intmax_t)oldsn,
		    (intmax_t)DINOSIZE(fs) - sup->su_nbytes);
		sup->su_nbytes -= DINOSIZE(fs);
		LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */
	}

	/* Add to its new segment, if any */
	if (newsn >= 0) {
		LFS_SEGENTRY(sup, fs, newsn, bp);
		DLOG((DLOG_SU, "seg %jd += %jd for ino %jd inode at 0x%jx\n",
		      (intmax_t)newsn,
		      (intmax_t)DINOSIZE(fs),
		      (intmax_t)ino, (intmax_t)ndaddr));
		KASSERTMSG(sup->su_nbytes <= lfs_sb_getssize(fs),
		    "lfs_writeinode: overfull segment "
		    "(segment %jd over by %jd)",
		    (intmax_t)newsn,
		    (intmax_t)sup->su_nbytes + DINOSIZE(fs));
		sup->su_nbytes += DINOSIZE(fs);
		LFS_WRITESEGENTRY(sup, fs, newsn, bp); /* Ifile */
	}

}
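
/*
 * Write ip's dinode into the current inode block of the partial
 * segment, allocating a new inode block (and possibly starting a new
 * partial segment) if there is not enough room.  The Ifile's own
 * inode gets special treatment, since writing it can dirty more of
 * the Ifile; see the loop below.
 */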
int
lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
{
	struct buf *bp;
	union lfs_dinode *cdp;
	struct vnode *vp = ITOV(ip);
	daddr_t daddr;
	IINFO *iip;
	int i;
	int gotblk = 0;
	int count;
	SEGSUM *ssp;

	ASSERT_SEGLOCK(fs);
	if (!(ip->i_state & IN_ALLMOD) && !(vp->v_uflag & VU_DIROP))
		return (0);

	/* Can't write ifile when writer is not set */
	KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 ||
	    (sp->seg_flags & SEGM_CLEAN));

	/*
	 * If this is the Ifile, see if writing it here will generate a
	 * temporary misaccounting.  If it will, do the accounting and write
	 * the blocks, postponing the inode write until the accounting is
	 * solid.
	 */
	count = 0;
	while (vp == fs->lfs_ivnode) {
		int redo = 0;

		/* If we don't have enough space, write to make space */
		if (sp->idp == NULL && sp->ibp == NULL &&
		    (sp->seg_bytes_left < lfs_sb_getibsize(fs) ||
		     sp->sum_bytes_left < IINFOSIZE(fs))) {
			(void) lfs_writeseg(fs, sp);
			continue;
		}

		/* Look for dirty Ifile blocks */
		mutex_enter(vp->v_interlock);
		LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) {
			if (!(bp->b_flags & B_GATHERED)) {
				DLOG((DLOG_SU, "ifile dirty lbn 0x%lx"
				      " flags 0x%x"
				      " cflags 0x%x"
				      " oflags 0x%x\n",
				      (long)bp->b_lblkno,
				      bp->b_flags,
				      bp->b_cflags,
				      bp->b_oflags));
				redo = 1;
				break;
			}
		}
		mutex_exit(vp->v_interlock);

		if (!redo)
			break;

		if (sp->idp) {
			/* Back out inode */
			lfs_dino_setinumber(fs, sp->idp, 0);
			sp->idp = NULL;
			lfs_sb_setidaddr(fs, 0x0);
		}
		++count;
		if (count > 2) {
			/*
			 * XXX This is a kludge.  We are waiting on a
			 * busy buffer.  We need to wait for whoever
			 * has it to unbusy it; maybe for it to be
			 * read from disk.  A one clock tick wait
			 * should suffice.  Better would be to protect
			 * Ifile reads with the segment lock, but the
			 * segment lock would require some
			 * modification to work well for that.
			 */
			kpause("lfsinow", false, 1, NULL);
#ifdef DEBUG
			log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count);
#endif /* DEBUG */
		}
		if (count > 10)
			panic("lfs_writeinode: looping");
		/* Write the file again, to gather the blocks */
		lfs_writefile(fs, sp, fs->lfs_ivnode);
	}

	/* Allocate a new inode block if necessary. */
	if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) &&
	    sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < lfs_sb_getibsize(fs) ||
		    sp->sum_bytes_left < IINFOSIZE(fs))
			(void) lfs_writeseg(fs, sp);

		/* Get next inode block. */
		daddr = lfs_sb_getoffset(fs);
		lfs_sb_addoffset(fs, lfs_btofsb(fs, lfs_sb_getibsize(fs)));
		sp->ibp = *sp->cbpp++ =
			getblk(VTOI(fs->lfs_ivnode)->i_devvp,
			    LFS_FSBTODB(fs, daddr), lfs_sb_getibsize(fs), 0, 0);
		gotblk++;

		/* Zero out inode numbers */
		for (i = 0; i < LFS_INOPB(fs); ++i) {
			union lfs_dinode *tmpdi;

			tmpdi = (union lfs_dinode *)((char *)sp->ibp->b_data +
			    DINOSIZE(fs) * i);
			lfs_dino_setinumber(fs, tmpdi, 0);
		}

		++sp->start_bpp;
		lfs_sb_subavail(fs, lfs_btofsb(fs, lfs_sb_getibsize(fs)));
		/* Set remaining space counters. */
		sp->seg_bytes_left -= lfs_sb_getibsize(fs);
		sp->sum_bytes_left -= IINFOSIZE(fs);

		/* Store the address in the segment summary. */
		iip = NTH_IINFO(fs, sp->segsum, sp->ninodes / LFS_INOPB(fs));
		lfs_ii_setblock(fs, iip, daddr);
	}

	/* Check VU_DIROP in case there is a new file with no data blocks */
	if (vp->v_uflag & VU_DIROP) {
		ssp = (SEGSUM *)sp->segsum;
		lfs_ss_setflags(fs, ssp,
		    lfs_ss_getflags(fs, ssp) | (SS_DIROP|SS_CONT));
	}

	/* Update the inode times and copy the inode onto the inode page. */
	/* XXX kludge --- don't redirty the ifile just to put times on it */
	if (ip->i_number != LFS_IFILE_INUM)
		LFS_ITIMES(ip, NULL, NULL, NULL);

	/*
	 * If this is the Ifile, and we've already written the Ifile in this
	 * partial segment, just overwrite it (it's not on disk yet) and
	 * continue.
	 *
	 * XXX we know that the bp that we get the second time around has
	 * already been gathered.
	 */
	if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
		lfs_copy_dinode(fs, sp->idp, ip->i_din);
		ip->i_lfs_osize = ip->i_size;
		return 0;
	}

	bp = sp->ibp;
	cdp = DINO_IN_BLOCK(fs, bp->b_data, sp->ninodes % LFS_INOPB(fs));
	DLOG((DLOG_SU, "write ino %jd to 0x%jx (seg %jd)\n",
	      (intmax_t)ip->i_number,
	      (intmax_t)LFS_DBTOFSB(fs, bp->b_blkno),
	      (intmax_t)lfs_dtosn(fs, LFS_DBTOFSB(fs, bp->b_blkno))));
	lfs_copy_dinode(fs, cdp, ip->i_din);

	/*
	 * This inode is on its way to disk; clear its VU_DIROP status when
	 * the write is complete.
	 */
	if (vp->v_uflag & VU_DIROP) {
		if (!(sp->seg_flags & SEGM_CLEAN))
			ip->i_state |= IN_CDIROP;
		else {
			DLOG((DLOG_DIROP, "lfs_writeinode: not clearing"
			      " dirop for cleaned ino %d\n",
			      (int)ip->i_number));
		}
	}

	/*
	 * If cleaning, link counts and directory file sizes cannot change,
	 * since those would be directory operations---even if the file
	 * we are writing is marked VU_DIROP we should write the old values.
	 * If we're not cleaning, of course, update the values so we get
	 * current values the next time we clean.
	 */
	if (sp->seg_flags & SEGM_CLEAN) {
		if (vp->v_uflag & VU_DIROP) {
			lfs_dino_setnlink(fs, cdp, ip->i_lfs_odnlink);
			/* if (vp->v_type == VDIR) */
			lfs_dino_setsize(fs, cdp, ip->i_lfs_osize);
		}
	} else {
		ip->i_lfs_odnlink = lfs_dino_getnlink(fs, cdp);
		ip->i_lfs_osize = ip->i_size;
	}


	/* We can finish the segment accounting for truncations now */
	lfs_finalize_ino_seguse(fs, ip);

	/*
	 * If we are cleaning, ensure that we don't write UNWRITTEN disk
	 * addresses to disk; possibly change the on-disk record of
	 * the inode size, either by reverting to the previous size
	 * (in the case of cleaning) or by verifying the inode's block
	 * holdings (in the case of files being allocated as they are being
	 * written).
	 * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail
	 * XXX count on disk wrong by the same amount.  We should be
	 * XXX able to "borrow" from lfs_avail and return it after the
	 * XXX Ifile is written.  See also in lfs_writeseg.
	 */

	/* Check file size based on highest allocated block */
	if (((lfs_dino_getmode(fs, ip->i_din) & LFS_IFMT) == LFS_IFREG ||
	     (lfs_dino_getmode(fs, ip->i_din) & LFS_IFMT) == LFS_IFDIR) &&
	    ip->i_size > ((ip->i_lfs_hiblk + 1) << lfs_sb_getbshift(fs))) {
		lfs_dino_setsize(fs, cdp, (ip->i_lfs_hiblk + 1) << lfs_sb_getbshift(fs));
		DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %"
		      PRId64 "\n", (int)ip->i_number, ip->i_size, lfs_dino_getsize(fs, cdp)));
	}
	if (ip->i_lfs_effnblks != lfs_dino_getblocks(fs, ip->i_din)) {
		DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %jd != nblk %d)"
		      " at %jx\n", ip->i_number, (intmax_t)ip->i_lfs_effnblks,
		      lfs_dino_getblocks(fs, ip->i_din), (uintmax_t)lfs_sb_getoffset(fs)));
		for (i=0; i<ULFS_NDADDR; i++) {
			if (lfs_dino_getdb(fs, cdp, i) == UNWRITTEN) {
				DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
				lfs_dino_setdb(fs, cdp, i, 0);
			}
		}
		for (i=0; i<ULFS_NIADDR; i++) {
			if (lfs_dino_getib(fs, cdp, i) == UNWRITTEN) {
				DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
				lfs_dino_setib(fs, cdp, i, 0);
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Check dinode held blocks against dinode size.
	 * This should be identical to the check in lfs_vget().
	 */
	for (i = (lfs_dino_getsize(fs, cdp) + lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs);
	     i < ULFS_NDADDR; i++) {
		KASSERT(i >= 0);
		if ((lfs_dino_getmode(fs, cdp) & LFS_IFMT) == LFS_IFLNK)
			continue;
		if (((lfs_dino_getmode(fs, cdp) & LFS_IFMT) == LFS_IFBLK ||
		     (lfs_dino_getmode(fs, cdp) & LFS_IFMT) == LFS_IFCHR) && i == 0)
			continue;
		if (lfs_dino_getdb(fs, cdp, i) != 0) {
# ifdef DEBUG
			lfs_dump_dinode(fs, cdp);
# endif
			panic("writing inconsistent inode");
		}
	}
#endif /* DIAGNOSTIC */

	if (!(ip->i_state & IN_CLEANING)) {
		/* XXX IN_ALLMOD */
		LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE |
		    IN_UPDATE | IN_MODIFY);
		if (ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din))
			LFS_CLR_UINO(ip, IN_MODIFIED);
		else {
			DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real "
			      "blks=%d, eff=%jd\n", ip->i_number,
			      lfs_dino_getblocks(fs, ip->i_din), (intmax_t)ip->i_lfs_effnblks));
		}
	}

	if (ip->i_number == LFS_IFILE_INUM) {
		/* We know sp->idp == NULL */
		sp->idp = DINO_IN_BLOCK(fs, bp, sp->ninodes % LFS_INOPB(fs));

		/* Not dirty any more */
		mutex_enter(&lfs_lock);
		fs->lfs_flags &= ~LFS_IFDIRTY;
		mutex_exit(&lfs_lock);
	}

	if (gotblk) {
		mutex_enter(&bufcache_lock);
		LFS_LOCK_BUF(bp);
		brelsel(bp, 0);
		mutex_exit(&bufcache_lock);
	}

	/* Increment inode count in segment summary block. */

	ssp = (SEGSUM *)sp->segsum;
	lfs_ss_setninos(fs, ssp, lfs_ss_getninos(fs, ssp) + 1);

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % LFS_INOPB(fs) == 0)
		sp->ibp = NULL;

	lfs_update_iaddr(fs, ip, LFS_DBTOFSB(fs, bp->b_blkno));

	return 0;
}
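
/*
 * Add a single buffer to the partial segment under construction,
 * recording its logical block number(s) in the current FINFO.  If the
 * buffer does not fit, write out the current partial segment and
 * return 1 so that the caller restarts its gathering loop; otherwise
 * return 0.
 */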
int
lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr)
{
	struct lfs *fs;
	int vers;
	int j, blksinblk;

	ASSERT_SEGLOCK(sp->fs);
	KASSERT((bp->b_flags & B_LOCKED) || (bp->b_cflags & (BC_NOCACHE | BC_INVAL)));
	KASSERT(bp->b_vp != NULL);
	KASSERTMSG((sp->vp != NULL),
	    "lfs_gatherblock: Null vp in segment");

	/*
	 * XXX If blksinblk > 1, might we not go into infinite loop here?
	 * XXX lfs_writeseg doesn't do anything if there
	 * XXX are no blocks, and we won't transition to a new segment
	 * XXX unless we are within a block of the end; so if blksinblk
	 * XXX is, say, 3 and we are 2 blocks from the end, I expect
	 * XXX this will not actually address the problem and instead
	 * XXX we will return to this point again with nothing changed.
	 * XXX 2025-10-08 kes
	 */
	/* If full, finish this segment. */
	fs = sp->fs;
	blksinblk = howmany(bp->b_bcount, lfs_sb_getbsize(fs));
	if (sp->sum_bytes_left < LFS_BLKPTRSIZE(fs) * blksinblk ||
	    sp->seg_bytes_left < bp->b_bcount) {
		KASSERT(++sp->gatherblock_loopcount < 2);
		if (mptr)
			mutex_exit(mptr);
		lfs_updatemeta(sp);

		DLOG((DLOG_SEG, "lfs_gatherblock trimming at offset %lx\n",
		      (long)lfs_sb_getoffset(fs)));

		vers = lfs_fi_getversion(fs, sp->fip);
		(void) lfs_writeseg(fs, sp);

		/* Add the current file to the segment summary. */
		lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers);

		if (mptr)
			mutex_enter(mptr);
		return (1);
	}
	sp->gatherblock_loopcount = 0;

	if (bp->b_flags & B_GATHERED) {
		DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %ju,"
		      " lbn %" PRId64 "\n",
		      (uintmax_t)lfs_fi_getino(fs, sp->fip), bp->b_lblkno));
		return (0);
	}

	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;

	*sp->cbpp++ = bp;
	for (j = 0; j < blksinblk; j++) {
		unsigned bn;

		bn = lfs_fi_getnblocks(fs, sp->fip);
		lfs_fi_setnblocks(fs, sp->fip, bn+1);
		lfs_fi_setblock(fs, sp->fip, bn, bp->b_lblkno + j);
		/* This block's accounting moves from lfs_favail to lfs_avail */
		lfs_deregister_block(sp->vp, bp->b_lblkno + j);
	}

	sp->sum_bytes_left -= LFS_BLKPTRSIZE(fs) * blksinblk;
	sp->seg_bytes_left -= bp->b_bcount;
	return (0);
}

/*
 * Similar to lfs_gather, but simply throws the buffers away.
 * Does not require the seglock.
 */
int
lfs_ungather(struct lfs *fs, struct segment *sp, struct vnode *vp,
	     int (*match)(struct lfs *, struct buf *))
{
	struct buf *bp, *nbp;
	int error = 0;

	if (vp->v_type == VBLK)
		return 0;
	mutex_enter(&bufcache_lock);

restart:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (!match(fs, bp))
			continue;
		error = bbusy(bp, false, hz / 10 + 1, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, (bp->b_flags & B_GATHERED ?
			     0 : (BC_INVAL | BC_VFLUSH)));
	}

	mutex_exit(&bufcache_lock);

	return 0;
}
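
/*
 * Gather all of vp's dirty buffers that satisfy the match function
 * into the current partial segment, restarting the scan whenever
 * lfs_gatherblock() starts a new partial segment.  Return the number
 * of buffers gathered.
 */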
int
lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp,
	   int (*match)(struct lfs *, struct buf *))
{
	struct buf *bp, *nbp;
	int count = 0;

	ASSERT_SEGLOCK(fs);
	if (vp->v_type == VBLK)
		return 0;
	KASSERT(sp->vp == NULL);
	sp->vp = vp;
	mutex_enter(&bufcache_lock);

#ifndef LFS_NO_BACKBUF_HACK
/* This is a hack to see if ordering the blocks in LFS makes a difference. */
# define	BUF_OFFSET	\
	(((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp)
# define	BACK_BUF(BP)	\
	((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET))
# define	BEG_OF_LIST	\
	((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET))

loop:
	/* Find last buffer. */
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
	     bp && LIST_NEXT(bp, b_vnbufs) != NULL;
	     bp = LIST_NEXT(bp, b_vnbufs))
		continue;

	for (; bp && bp != BEG_OF_LIST; bp = nbp) {
		nbp = BACK_BUF(bp);
#else /* LFS_NO_BACKBUF_HACK */
loop:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
#endif /* LFS_NO_BACKBUF_HACK */
		if ((bp->b_cflags & BC_BUSY) != 0 ||
		    (bp->b_flags & B_GATHERED) != 0 ||
		    (bp->b_flags & B_LOCKED) == 0 ||
		    (bp->b_oflags & BO_DELWRI) == 0 ||
		    !match(fs, bp)) {
#ifdef DEBUG
			if (vp == fs->lfs_ivnode &&
			    (bp->b_cflags & BC_BUSY) != 0 &&
			    (bp->b_flags & B_GATHERED) == 0)
				log(LOG_NOTICE, "lfs_gather: ifile lbn %"
				    PRId64 " busy (%x) at 0x%jx",
				    bp->b_lblkno, bp->b_flags,
				    (uintmax_t)lfs_sb_getoffset(fs));
#endif
			continue;
		}
#ifdef DIAGNOSTIC
# ifdef LFS_USE_BC_INVAL
		if ((bp->b_cflags & BC_INVAL) != 0 && bp->b_iodone == NULL) {
			DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
			      " is BC_INVAL\n", bp->b_lblkno));
			VOP_PRINT(bp->b_vp);
		}
# endif /* LFS_USE_BC_INVAL */
		if (!(bp->b_oflags & BO_DELWRI))
			panic("lfs_gather: bp not BO_DELWRI");
		if (!(bp->b_flags & B_LOCKED)) {
			DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
			      " blk %" PRId64 " not B_LOCKED\n",
			      bp->b_lblkno,
			      LFS_DBTOFSB(fs, bp->b_blkno)));
			VOP_PRINT(bp->b_vp);
			panic("lfs_gather: bp not B_LOCKED");
		}
#endif
		if (lfs_gatherblock(sp, bp, &bufcache_lock)) {
			goto loop;
		}
		count++;
	}
	mutex_exit(&bufcache_lock);
	lfs_updatemeta(sp);
	KASSERT(sp->vp == vp);
	sp->vp = NULL;
	return count;
}

#if DEBUG
# define DEBUG_OOFF(n) do {						\
	if (ooff == 0) {						\
		DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \
			"ino %d lbn %" PRId64 " at 0x%" PRIx32		\
			", was 0x0 (or %" PRId64 ")\n",			\
			(n), ip->i_number, lbn, ndaddr, daddr));	\
	}								\
} while (0)
#else
# define DEBUG_OOFF(n)
#endif

/*
 * Change the given block's address to ndaddr, finding its previous
 * location using ulfs_bmaparray().
 *
 * Account for this change in the segment table.
 *
 * called with sp == NULL by roll-forwarding code.
 */
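/*
 * True if the given address does not name an on-disk location: the
 * block is a hole, or is allocated but not yet written.
 */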
#define NOT_ON_DISK(daddr) ((daddr) == 0 || (daddr) == UNASSIGNED || (daddr) == UNWRITTEN)

void
lfs_update_single(struct lfs *fs, struct segment *sp,
    struct vnode *vp, daddr_t lbn, daddr_t ndaddr, int size)
{
	SEGUSE *sup;
	struct buf *bp;
	struct indir a[ULFS_NIADDR + 2], *ap;
	struct inode *ip;
	daddr_t daddr, ooff;
	int num, error;
	int bb, osize, obb;

	ASSERT_SEGLOCK(fs);
	KASSERT(sp == NULL || sp->vp == vp);
	ip = VTOI(vp);

	KASSERTMSG(sp == NULL || lfs_dtosn(fs, ndaddr)
	    == lfs_dtosn(fs, lfs_sb_getoffset(fs)),
	    "Segment overwrite");

	error = ulfs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL);
	if (error)
		panic("lfs_updatemeta: ulfs_bmaparray returned %d", error);

	KASSERT(daddr <= LFS_MAX_DADDR(fs));
	if (daddr > 0)
		daddr = LFS_DBTOFSB(fs, daddr);

	bb = lfs_numfrags(fs, size);
	switch (num) {
	    case 0:
		ooff = lfs_dino_getdb(fs, ip->i_din, lbn);
		DEBUG_OOFF(0);
		if (NOT_ON_DISK(ooff))
			lfs_dino_setblocks(fs, ip->i_din,
			    lfs_dino_getblocks(fs, ip->i_din) + bb);
		else {
			/* possible fragment truncation or extension */
			obb = lfs_btofsb(fs, ip->i_lfs_fragsize[lbn]);
			lfs_dino_setblocks(fs, ip->i_din,
			    lfs_dino_getblocks(fs, ip->i_din) + (bb-obb));
		}
		lfs_dino_setdb(fs, ip->i_din, lbn, ndaddr);
		break;
	    case 1:
		ooff = lfs_dino_getib(fs, ip->i_din, a[0].in_off);
		DEBUG_OOFF(1);
		if (NOT_ON_DISK(ooff))
			lfs_dino_setblocks(fs, ip->i_din,
			    lfs_dino_getblocks(fs, ip->i_din) + bb);
		lfs_dino_setib(fs, ip->i_din, a[0].in_off, ndaddr);
		break;
	    default:
		ap = &a[num - 1];
		if (bread(vp, ap->in_lbn, lfs_sb_getbsize(fs),
		    B_MODIFY, &bp))
			panic("lfs_updatemeta: bread bno %" PRId64,
			      ap->in_lbn);

		ooff = lfs_iblock_get(fs, bp->b_data, ap->in_off);
		DEBUG_OOFF(num);
		if (NOT_ON_DISK(ooff))
			lfs_dino_setblocks(fs, ip->i_din,
			    lfs_dino_getblocks(fs, ip->i_din) + bb);
		lfs_iblock_set(fs, bp->b_data, ap->in_off, ndaddr);
		(void) VOP_BWRITE(bp->b_vp, bp);
	}

	KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr);

	/* Update hiblk when extending the file */
	if (lbn > ip->i_lfs_hiblk)
		ip->i_lfs_hiblk = lbn;

	/*
	 * Though we'd rather it couldn't, this *can* happen right now
	 * if cleaning blocks and regular blocks coexist.
	 */
	/* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */

	/*
	 * Update segment usage information, based on old size
	 * and location.
	 */
	if (daddr > 0) {
		u_int32_t oldsn = lfs_dtosn(fs, daddr);

		KASSERT(oldsn < lfs_sb_getnseg(fs));
		if (lbn >= 0 && lbn < ULFS_NDADDR)
			osize = ip->i_lfs_fragsize[lbn];
		else
			osize = lfs_sb_getbsize(fs);
		LFS_SEGENTRY(sup, fs, oldsn, bp);
		KASSERTMSG(sup->su_nbytes >= osize,
		    "lfs_updatemeta: negative bytes "
		    "(segment %" PRIu32 " short by %" PRId64
		    ")\n"
		    "lfs_updatemeta: ino %llu, lbn %" PRId64
		    ", addr = 0x%" PRIx64 "\n",
		    lfs_dtosn(fs, daddr),
		    (int64_t)osize - sup->su_nbytes,
		    (unsigned long long)ip->i_number, lbn, daddr);
		DLOG((DLOG_SU, "seg %jd -= %jd for ino %jd lbn %jd"
		      " db 0x%jx\n",
		      (intmax_t)lfs_dtosn(fs, daddr), (intmax_t)osize,
		      (intmax_t)ip->i_number, (intmax_t)lbn,
		      (intmax_t)daddr));
		sup->su_nbytes -= osize;
		if (!(bp->b_flags & B_GATHERED)) {
			mutex_enter(&lfs_lock);
			fs->lfs_flags |= LFS_IFDIRTY;
			mutex_exit(&lfs_lock);
		}
		LFS_WRITESEGENTRY(sup, fs, oldsn, bp);
	}
	/*
	 * Now that this block has a new address, and its old
	 * segment no longer owns it, we can forget about its
	 * old size.
	 */
	if (lbn >= 0 && lbn < ULFS_NDADDR)
		ip->i_lfs_fragsize[lbn] = size;
}

/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.
 */
void
lfs_updatemeta(struct segment *sp)
{
	struct buf *sbp;
	struct lfs *fs;
	struct vnode *vp;
	daddr_t lbn;
	int i, nblocks, num;
	int __diagused nblocks_orig;
	int bb;
	int bytesleft, size;
	unsigned lastlength;
	union lfs_blocks tmpptr;

	fs = sp->fs;
	vp = sp->vp;
	ASSERT_SEGLOCK(fs);

	/*
	 * This used to be:
	 *
	 *  nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	 *
	 * that is, it allowed for the possibility that start_lbp did
	 * not point to the beginning of the finfo block pointer area.
	 * This particular formulation is six kinds of painful in the
	 * lfs64 world where we have two sizes of block pointer, so
	 * unless/until everything can be cleaned up to not move
	 * start_lbp around but instead use an offset, we do the
	 * following:
	 *    1. Get NEXT_FINFO(sp->fip). This is the same pointer as
	 * &sp->fip->fi_blocks[sp->fip->fi_nblocks], just the wrong
	 * type. (Ugh.)
	 *    2. Cast it to void *, then assign it to a temporary
	 * union lfs_blocks.
	 *    3. Subtract start_lbp from that.
	 *    4. Save the value of nblocks in blocks_orig so we can
	 * assert below that it hasn't changed without repeating this
	 * rubbish.
	 *
	 * XXX.
	 */
	lfs_blocks_fromvoid(fs, &tmpptr, (void *)NEXT_FINFO(fs, sp->fip));
	nblocks = lfs_blocks_sub(fs, &tmpptr, &sp->start_lbp);
	nblocks_orig = nblocks;

	KASSERT(nblocks >= 0);
	KASSERT(vp != NULL);
	if (nblocks == 0)
		return;

	/*
	 * This count may be high due to oversize blocks from lfs_gop_write.
	 * Correct for this. (XXX we should be able to keep track of these.)
	 */
	for (i = 0; i < nblocks; i++) {
		if (sp->start_bpp[i] == NULL) {
			DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks));
			nblocks = i;
			break;
		}
		num = howmany(sp->start_bpp[i]->b_bcount, lfs_sb_getbsize(fs));
		KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1);
		nblocks -= num - 1;
	}

#if 0
	/* pre-lfs64 assertion */
	KASSERT(vp->v_type == VREG ||
	   nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp);
#else
	KASSERT(vp->v_type == VREG || nblocks == nblocks_orig);
#endif
	KASSERT(nblocks == sp->cbpp - sp->start_bpp);

	/*
	 * Sort the blocks.
	 *
	 * We have to sort even if the blocks come from the
	 * cleaner, because there might be other pending blocks on the
	 * same inode...and if we don't sort, and there are fragments
	 * present, blocks may be written in the wrong place.
	 */
	lfs_shellsort(fs, sp->start_bpp, &sp->start_lbp, nblocks, lfs_sb_getbsize(fs));

	/*
	 * Record the length of the last block in case it's a fragment.
	 * If there are indirect blocks present, they sort last.  An
	 * indirect block will be lfs_bsize and its presence indicates
	 * that you cannot have fragments.
	 *
	 * XXX This last is a lie.  A cleaned fragment can coexist with
	 * XXX a later indirect block.  This will continue to be
	 * XXX true until lfs_markv is fixed to do everything with
	 * XXX fake blocks (including fake inodes and fake indirect blocks).
	 */
	lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) &
	    lfs_sb_getbmask(fs)) + 1;
	lfs_fi_setlastlength(fs, sp->fip, lastlength);

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	for (i = nblocks; i--; ++sp->start_bpp) {
		sbp = *sp->start_bpp;
		lbn = lfs_blocks_get(fs, &sp->start_lbp, 0);
		KASSERT(sbp->b_lblkno == lbn);

		sbp->b_blkno = LFS_FSBTODB(fs, lfs_sb_getoffset(fs));

		/*
		 * If we write a frag in the wrong place, the cleaner won't
		 * be able to correctly identify its size later, and the
		 * segment will be uncleanable.  (Even worse, it will assume
		 * that the indirect block that actually ends the list
		 * is of a smaller size!)
		 */
		if ((sbp->b_bcount & lfs_sb_getbmask(fs)) && i != 0)
			panic("lfs_updatemeta: fragment is not last block");

		/*
		 * For each subblock in this possibly oversized block,
		 * update its address on disk.
		 */
		KASSERT(lbn >= 0 || sbp->b_bcount == lfs_sb_getbsize(fs));
		KASSERT(sbp->b_vp != NULL);
		KASSERT(vp == sbp->b_vp);
		for (bytesleft = sbp->b_bcount; bytesleft > 0;
		     bytesleft -= lfs_sb_getbsize(fs)) {
			size = MIN(bytesleft, lfs_sb_getbsize(fs));
			bb = lfs_numfrags(fs, size);
			lbn = lfs_blocks_get(fs, &sp->start_lbp, 0);
			lfs_blocks_inc(fs, &sp->start_lbp);
			lfs_update_single(fs, sp, sp->vp, lbn, lfs_sb_getoffset(fs),
			    size);
			lfs_sb_addoffset(fs, bb);
		}

	}

	/* This inode has been modified */
	LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
}

/*
 * Move lfs_offset to a segment earlier than newsn.
 */
int
lfs_rewind(struct lfs *fs, int newsn)
{
	int sn, osn, isdirty;
	struct buf *bp;
	SEGUSE *sup;

	ASSERT_SEGLOCK(fs);

	osn = lfs_dtosn(fs, lfs_sb_getoffset(fs));
	if (osn < newsn)
		return 0;

	/* lfs_avail eats the remaining space in this segment */
	lfs_sb_subavail(fs, lfs_sb_getfsbpseg(fs) - (lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs)));

	/* Find a low-numbered segment */
	for (sn = 0; sn < lfs_sb_getnseg(fs); ++sn) {
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp, 0);

		if (!isdirty)
			break;
	}
	if (sn == lfs_sb_getnseg(fs))
		panic("lfs_rewind: no clean segments");
	if (newsn >= 0 && sn >= newsn)
		return ENOENT;
	lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
	lfs_newseg(fs);
	lfs_sb_setoffset(fs, lfs_sb_getcurseg(fs));

	return 0;
}

/*
 * Start a new partial segment.
 *
 * Return 1 when we entered a new segment.
 * Otherwise, return 0.
 */
int
lfs_initseg(struct lfs *fs, uint16_t flags)
{
	struct segment *sp = fs->lfs_sp;
	SEGSUM *ssp;
	struct buf *sbp;	/* buffer for SEGSUM */
	int repeat = 0;		/* return value */
	SEGUSE *sup;
	struct buf *bp;

	ASSERT_SEGLOCK(fs);
	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* lfs_avail eats the remaining space */
		lfs_sb_subavail(fs, lfs_sb_getfsbpseg(fs) - (lfs_sb_getoffset(fs) -
		    lfs_sb_getcurseg(fs)));
		/* Wake up any cleaning procs waiting on this file system. */
		lfs_wakeup_cleaner(fs);
		lfs_newseg(fs);
		repeat = 1;
		lfs_sb_setoffset(fs, lfs_sb_getcurseg(fs));

		sp->seg_number = lfs_dtosn(fs, lfs_sb_getcurseg(fs));
		sp->seg_bytes_left = lfs_fsbtob(fs, lfs_sb_getfsbpseg(fs));

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			lfs_sb_addoffset(fs, lfs_btofsb(fs, LFS_SBPAD));
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp, 0);
		/* Segment zero could also contain the labelpad */
		if (lfs_sb_getversion(fs) > 1 && sp->seg_number == 0 &&
		    lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD)) {
			lfs_sb_addoffset(fs,
			    lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs));
			sp->seg_bytes_left -=
			    LFS_LABELPAD - lfs_fsbtob(fs, lfs_sb_gets0addr(fs));
		}
	} else {
		sp->seg_number = lfs_dtosn(fs, lfs_sb_getcurseg(fs));
		sp->seg_bytes_left = lfs_fsbtob(fs, lfs_sb_getfsbpseg(fs) -
		    (lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs)));
	}
	lfs_sb_setlastpseg(fs, lfs_sb_getoffset(fs));

	/* Record first address of this partial segment */
	if (sp->seg_flags & SEGM_CLEAN) {
		fs->lfs_cleanint[fs->lfs_cleanind] = lfs_sb_getoffset(fs);
		if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) {
			/* "1" is the artificial inc in lfs_seglock */
			mutex_enter(&lfs_lock);
			while (fs->lfs_iocount > 1) {
				mtsleep(&fs->lfs_iocount, PRIBIO + 1,
				    "lfs_initseg", 0, &lfs_lock);
			}
			mutex_exit(&lfs_lock);
			fs->lfs_cleanind = 0;
		}
	}

	sp->fs = fs;
	sp->ibp = NULL;
	sp->idp = NULL;
	sp->ninodes = 0;

	sp->cbpp = sp->bpp;

	/* Get a new buffer for SEGSUM */
	sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
	    LFS_FSBTODB(fs, lfs_sb_getoffset(fs)), lfs_sb_getsumsize(fs), LFS_NB_SUMMARY);

	/* ... and enter it into the buffer list. */
	*sp->cbpp = sbp;
	sp->cbpp++;
	lfs_sb_addoffset(fs, lfs_btofsb(fs, lfs_sb_getsumsize(fs)));

	sp->start_bpp = sp->cbpp;

	if ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid == fs->lfs_rfpid)
		flags |= SS_RFW;

	/* Set point to SEGSUM, initialize it. */
	ssp = sp->segsum = sbp->b_data;
	memset(ssp, 0, lfs_sb_getsumsize(fs));
	lfs_ss_setnext(fs, ssp, lfs_sb_getnextseg(fs));
	lfs_ss_setnfinfo(fs, ssp, 0);
	lfs_ss_setninos(fs, ssp, 0);
	lfs_ss_setmagic(fs, ssp, SS_MAGIC);
	lfs_ss_setflags(fs, ssp, flags);

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = SEGSUM_FINFOBASE(fs, sp->segsum);
	lfs_fi_setnblocks(fs, sp->fip, 0);
	lfs_fi_setlastlength(fs, sp->fip, 0);
	lfs_blocks_fromfinfo(fs, &sp->start_lbp, sp->fip);

	sp->seg_bytes_left -= lfs_sb_getsumsize(fs);
	sp->sum_bytes_left = lfs_sb_getsumsize(fs) - SEGSUM_SIZE(fs);

	return (repeat);
}
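
/*
 * Mark segment sn SEGUSE_INVAL so that it will not be used.  If the
 * segment still contains live bytes, instead clear SEGUSE_INVAL from
 * all segments and return EBUSY.
 */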
1855 */ 1856 LFS_SEGENTRY(sup, fs, sp->seg_number, bp); 1857 if (sup->su_flags & SEGUSE_SUPERBLOCK) { 1858 lfs_sb_addoffset(fs, lfs_btofsb(fs, LFS_SBPAD)); 1859 sp->seg_bytes_left -= LFS_SBPAD; 1860 } 1861 brelse(bp, 0); 1862 /* Segment zero could also contain the labelpad */ 1863 if (lfs_sb_getversion(fs) > 1 && sp->seg_number == 0 && 1864 lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD)) { 1865 lfs_sb_addoffset(fs, 1866 lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs)); 1867 sp->seg_bytes_left -= 1868 LFS_LABELPAD - lfs_fsbtob(fs, lfs_sb_gets0addr(fs)); 1869 } 1870 } else { 1871 sp->seg_number = lfs_dtosn(fs, lfs_sb_getcurseg(fs)); 1872 sp->seg_bytes_left = lfs_fsbtob(fs, lfs_sb_getfsbpseg(fs) - 1873 (lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs))); 1874 } 1875 lfs_sb_setlastpseg(fs, lfs_sb_getoffset(fs)); 1876 1877 /* Record first address of this partial segment */ 1878 if (sp->seg_flags & SEGM_CLEAN) { 1879 fs->lfs_cleanint[fs->lfs_cleanind] = lfs_sb_getoffset(fs); 1880 if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) { 1881 /* "1" is the artificial inc in lfs_seglock */ 1882 mutex_enter(&lfs_lock); 1883 while (fs->lfs_iocount > 1) { 1884 mtsleep(&fs->lfs_iocount, PRIBIO + 1, 1885 "lfs_initseg", 0, &lfs_lock); 1886 } 1887 mutex_exit(&lfs_lock); 1888 fs->lfs_cleanind = 0; 1889 } 1890 } 1891 1892 sp->fs = fs; 1893 sp->ibp = NULL; 1894 sp->idp = NULL; 1895 sp->ninodes = 0; 1896 1897 sp->cbpp = sp->bpp; 1898 1899 /* Get a new buffer for SEGSUM */ 1900 sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, 1901 LFS_FSBTODB(fs, lfs_sb_getoffset(fs)), lfs_sb_getsumsize(fs), LFS_NB_SUMMARY); 1902 1903 /* ... and enter it into the buffer list. */ 1904 *sp->cbpp = sbp; 1905 sp->cbpp++; 1906 lfs_sb_addoffset(fs, lfs_btofsb(fs, lfs_sb_getsumsize(fs))); 1907 1908 sp->start_bpp = sp->cbpp; 1909 1910 if ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid == fs->lfs_rfpid) 1911 flags |= SS_RFW; 1912 1913 /* Set point to SEGSUM, initialize it. */ 1914 ssp = sp->segsum = sbp->b_data; 1915 memset(ssp, 0, lfs_sb_getsumsize(fs)); 1916 lfs_ss_setnext(fs, ssp, lfs_sb_getnextseg(fs)); 1917 lfs_ss_setnfinfo(fs, ssp, 0); 1918 lfs_ss_setninos(fs, ssp, 0); 1919 lfs_ss_setmagic(fs, ssp, SS_MAGIC); 1920 lfs_ss_setflags(fs, ssp, flags); 1921 1922 /* Set pointer to first FINFO, initialize it. */ 1923 sp->fip = SEGSUM_FINFOBASE(fs, sp->segsum); 1924 lfs_fi_setnblocks(fs, sp->fip, 0); 1925 lfs_fi_setlastlength(fs, sp->fip, 0); 1926 lfs_blocks_fromfinfo(fs, &sp->start_lbp, sp->fip); 1927 1928 sp->seg_bytes_left -= lfs_sb_getsumsize(fs); 1929 sp->sum_bytes_left = lfs_sb_getsumsize(fs) - SEGSUM_SIZE(fs); 1930 1931 return (repeat); 1932 } 1933 1934 int 1935 lfs_invalidate(struct lfs *fs, int sn) 1936 { 1937 SEGUSE *sup; 1938 struct buf *bp; 1939 1940 LFS_SEGENTRY(sup, fs, sn, bp); 1941 if (sup->su_nbytes > 0) { 1942 brelse(bp, 0); 1943 lfs_seguse_clrflag_all(fs, SEGUSE_INVAL); 1944 return EBUSY; 1945 } 1946 sup->su_flags |= SEGUSE_INVAL; 1947 VOP_BWRITE(bp->b_vp, bp); 1948 return 0; 1949 } 1950 1951 /* 1952 * Return the next segment to write. 
1953 */ 1954 void 1955 lfs_newseg(struct lfs *fs) 1956 { 1957 CLEANERINFO *cip; 1958 SEGUSE *sup; 1959 struct buf *bp; 1960 int curseg, isdirty, sn, skip_inval; 1961 1962 ASSERT_SEGLOCK(fs); 1963 1964 /* Honor LFCNWRAPSTOP */ 1965 mutex_enter(&lfs_lock); 1966 while (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) { 1967 if (fs->lfs_wrappass) { 1968 log(LOG_NOTICE, "%s: wrappass=%d\n", 1969 lfs_sb_getfsmnt(fs), fs->lfs_wrappass); 1970 fs->lfs_wrappass = 0; 1971 break; 1972 } 1973 fs->lfs_wrapstatus = LFS_WRAP_WAITING; 1974 wakeup(&fs->lfs_nowrap); 1975 log(LOG_NOTICE, "%s: waiting at log wrap\n", lfs_sb_getfsmnt(fs)); 1976 mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz, 1977 &lfs_lock); 1978 } 1979 fs->lfs_wrapstatus = LFS_WRAP_GOING; 1980 mutex_exit(&lfs_lock); 1981 1982 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getnextseg(fs)), bp); 1983 DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n", 1984 lfs_dtosn(fs, lfs_sb_getnextseg(fs)))); 1985 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 1986 /* XXX these flags should not be on clean segmentss */ 1987 sup->su_flags &= ~(SEGUSE_EMPTY | SEGUSE_READY | SEGUSE_ERROR); 1988 sup->su_nbytes = 0; 1989 sup->su_nsums = 0; 1990 sup->su_ninos = 0; 1991 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getnextseg(fs)), bp); 1992 1993 LFS_CLEANERINFO(cip, fs, bp); 1994 lfs_ci_shiftcleantodirty(fs, cip, 1); 1995 lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip)); 1996 LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); 1997 1998 lfs_sb_setlastseg(fs, lfs_sb_getcurseg(fs)); 1999 lfs_sb_setcurseg(fs, lfs_sb_getnextseg(fs)); 2000 skip_inval = 1; 2001 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs)) + lfs_sb_getinterleave(fs);;) { 2002 sn = (sn + 1) % lfs_sb_getnseg(fs); 2003 2004 if (sn == curseg) { 2005 if (skip_inval) 2006 skip_inval = 0; 2007 else 2008 panic("lfs_nextseg: no clean segments"); 2009 } 2010 LFS_SEGENTRY(sup, fs, sn, bp); 2011 isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? 
SEGUSE_INVAL : 0));
2012 brelse(bp, 0);
2013
2014 if (!isdirty)
2015 break;
2016 }
2017 if (skip_inval == 0)
2018 lfs_seguse_clrflag_all(fs, SEGUSE_INVAL);
2019
2020 ++fs->lfs_nactive;
2021 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
2022 if (lfs_dostats) {
2023 ++lfs_stats.segsused;
2024 }
2025 }
2026
2027 static struct buf *
2028 lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr,
2029 int n)
2030 {
2031 struct lfs_cluster *cl;
2032 struct buf **bpp, *bp;
2033
2034 ASSERT_SEGLOCK(fs);
2035 cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK);
2036 bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK);
2037 memset(cl, 0, sizeof(*cl));
2038 cl->fs = fs;
2039 cl->bpp = bpp;
2040 cl->bufcount = 0;
2041 cl->bufsize = 0;
2042
2043 /* If this segment is being written synchronously, note that */
2044 if (fs->lfs_sp->seg_flags & SEGM_SYNC) {
2045 cl->flags |= LFS_CL_SYNC;
2046 cl->seg = fs->lfs_sp;
2047 ++cl->seg->seg_iocount;
2048 }
2049
2050 /* Get an empty buffer header, or maybe one with something on it */
2051 bp = getiobuf(vp, true);
2052 bp->b_dev = NODEV;
2053 bp->b_blkno = bp->b_lblkno = addr;
2054 bp->b_iodone = lfs_cluster_aiodone;
2055 bp->b_private = cl;
2056
2057 return bp;
2058 }
2059
2060 int
2061 lfs_writeseg(struct lfs *fs, struct segment *sp)
2062 {
2063 struct buf **bpp, *bp, *cbp, *newbp, *unbusybp;
2064 SEGUSE *sup;
2065 SEGSUM *ssp;
2066 int i;
2067 int do_again, nblocks, byteoffset;
2068 size_t el_size;
2069 struct lfs_cluster *cl;
2070 u_short ninos;
2071 struct vnode *devvp;
2072 char *p = NULL;
2073 struct vnode *vp;
2074 unsigned ibindex, iblimit;
2075 int changed;
2076 u_int32_t sum;
2077 size_t sumstart;
2078 uint16_t oflags;
2079 #ifdef DEBUG
2080 FINFO *fip;
2081 int findex;
2082 #endif
2083
2084 ASSERT_SEGLOCK(fs);
2085
2086 ssp = (SEGSUM *)sp->segsum;
2087
2088 /*
2089 * If there are no buffers other than the segment summary to write,
2090 * don't do anything. If we are at the end of a dirop sequence, however,
2091 * write the empty segment summary anyway, to help out the
2092 * roll-forward agent.
2093 */
2094 if ((nblocks = sp->cbpp - sp->bpp) == 1) {
2095 if ((lfs_ss_getflags(fs, ssp) & (SS_DIROP | SS_CONT)) != SS_DIROP)
2096 return 0;
2097 }
2098
2099 /* Note if partial segment is being written by the cleaner */
2100 if (sp->seg_flags & SEGM_CLEAN)
2101 lfs_ss_setflags(fs, ssp, lfs_ss_getflags(fs, ssp) | SS_CLEAN);
2102
2103 /* Note if we are writing to reclaim */
2104 if (sp->seg_flags & SEGM_RECLAIM) {
2105 lfs_ss_setflags(fs, ssp, lfs_ss_getflags(fs, ssp) | SS_RECLAIM);
2106 lfs_ss_setreclino(fs, ssp, fs->lfs_reclino);
2107 }
2108 /* Save flags to attach to a new partial segment, if we need one */
2109 oflags = lfs_ss_getflags(fs, ssp);
2110
2111 devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2112
2113 /* Update the segment usage information. */
2114 LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
2115
2116 /* Loop through all blocks, except the segment summary. */
2117 for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
2118 if ((*bpp)->b_vp != devvp) {
2119 KASSERT(lfs_dtosn(fs, LFS_DBTOFSB(fs, (*bpp)->b_blkno)) == sp->seg_number);
2120 sup->su_nbytes += (*bpp)->b_bcount;
2121 DLOG((DLOG_SU, "seg %jd += %jd for ino %jd"
2122 " lbn %jd db 0x%jx\n",
2123 (intmax_t)sp->seg_number,
2124 (intmax_t)(*bpp)->b_bcount,
2125 (intmax_t)VTOI((*bpp)->b_vp)->i_number,
2126 (intmax_t)(*bpp)->b_lblkno,
2127 (uintmax_t)LFS_DBTOFSB(fs, (*bpp)->b_blkno)));
2128 }
2129 }
2130
2131 #ifdef DEBUG
2132 /* Check for zero-length and zero-version FINFO entries.
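 * (An empty or zero-version FINFO would be meaningless on disk and
 * could confuse the roll-forward agent, so assert here, before the
 * summary is checksummed and written.)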
*/ 2133 fip = SEGSUM_FINFOBASE(fs, ssp);
2134 for (findex = 0; findex < lfs_ss_getnfinfo(fs, ssp); findex++) {
2135 KDASSERT(lfs_fi_getnblocks(fs, fip) > 0);
2136 KDASSERT(lfs_fi_getversion(fs, fip) > 0);
2137 fip = NEXT_FINFO(fs, fip);
2138 }
2139 #endif /* DEBUG */
2140
2141 ninos = (lfs_ss_getninos(fs, ssp) + LFS_INOPB(fs) - 1) / LFS_INOPB(fs);
2142 if (lfs_sb_getversion(fs) == 1)
2143 sup->su_olastmod = time_second;
2144 else
2145 sup->su_lastmod = time_second;
2146 sup->su_ninos += ninos;
2147 ++sup->su_nsums;
2148 lfs_sb_subavail(fs, lfs_btofsb(fs, lfs_sb_getsumsize(fs)));
2149
2150 do_again = !(bp->b_flags & B_GATHERED);
2151 LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */
2152
2153 /*
2154 * Mark blocks BC_BUSY, to prevent them from being changed between
2155 * the checksum computation and the actual write.
2156 *
2157 * If we are cleaning, check indirect blocks for UNWRITTEN, and if
2158 * there are any, replace them with copies that have UNASSIGNED
2159 * instead.
2160 */
2161 mutex_enter(&bufcache_lock);
2162 for (bpp = sp->bpp, i = nblocks - 1; i--;) {
2163 ++bpp;
2164 bp = *bpp;
2165 if (bp->b_iodone != NULL) { /* UBC or malloced buffer */
2166 bp->b_cflags |= BC_BUSY;
2167 continue;
2168 }
2169
2170 while (bp->b_cflags & BC_BUSY) {
2171 DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
2172 " data summary corruption for ino %d, lbn %"
2173 PRId64 "\n",
2174 VTOI(bp->b_vp)->i_number, bp->b_lblkno));
2175 bp->b_cflags |= BC_WANTED;
2176 cv_wait(&bp->b_busy, &bufcache_lock);
2177 }
2178 bp->b_cflags |= BC_BUSY;
2179 mutex_exit(&bufcache_lock);
2180 unbusybp = NULL;
2181
2182 /*
2183 * Check and replace indirect block UNWRITTEN bogosity.
2184 * XXX See comment in lfs_writefile.
2185 */
2186 if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp &&
2187 lfs_dino_getblocks(fs, VTOI(bp->b_vp)->i_din) !=
2188 VTOI(bp->b_vp)->i_lfs_effnblks) {
2189 DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%jd != %d)\n",
2190 VTOI(bp->b_vp)->i_number,
2191 (intmax_t)VTOI(bp->b_vp)->i_lfs_effnblks,
2192 lfs_dino_getblocks(fs, VTOI(bp->b_vp)->i_din)));
2193 /* Make a copy we'll make changes to */
2194 newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
2195 bp->b_bcount, LFS_NB_IBLOCK);
2196 newbp->b_blkno = bp->b_blkno;
2197 memcpy(newbp->b_data, bp->b_data,
2198 newbp->b_bcount);
2199
2200 changed = 0;
2201 iblimit = newbp->b_bcount / LFS_BLKPTRSIZE(fs);
2202 for (ibindex = 0; ibindex < iblimit; ibindex++) {
2203 if (lfs_iblock_get(fs, newbp->b_data, ibindex) == UNWRITTEN) {
2204 ++changed;
2205 lfs_iblock_set(fs, newbp->b_data,
2206 ibindex, 0);
2207 }
2208 }
2209 /*
2210 * Get rid of the old buffer. Don't mark it clean,
2211 * though, if it still has dirty data on it.
2212 */
2213 if (changed) {
2214 DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):"
2215 " bp = %p newbp = %p\n", changed, bp,
2216 newbp));
2217 *bpp = newbp;
2218 bp->b_flags &= ~B_GATHERED;
2219 bp->b_error = 0;
2220 if (bp->b_iodone != NULL) {
2221 DLOG((DLOG_SEG, "lfs_writeseg: "
2222 "indir bp should not be B_CALL\n"));
2223 biodone(bp);
2224 bp = NULL;
2225 } else {
2226 /* Still on free list, leave it there */
2227 unbusybp = bp;
2228 /*
2229 * We have to re-decrement lfs_avail
2230 * since this block is going to come
2231 * back around to us in the next
2232 * segment.
2233
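 * (The buffer stays dirty on the vnode, so a future partial
 * segment will gather and write it again, consuming space a
 * second time; account for that space now.)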
2233 */ 2234 lfs_sb_subavail(fs, 2235 lfs_btofsb(fs, bp->b_bcount)); 2236 } 2237 } else { 2238 lfs_freebuf(fs, newbp); 2239 } 2240 } 2241 mutex_enter(&bufcache_lock); 2242 if (unbusybp != NULL) { 2243 unbusybp->b_cflags &= ~BC_BUSY; 2244 if (unbusybp->b_cflags & BC_WANTED) 2245 cv_broadcast(&bp->b_busy); 2246 } 2247 } 2248 mutex_exit(&bufcache_lock); 2249 2250 /* 2251 * Compute checksum across data and then across summary; the first 2252 * block (the summary block) is skipped. Set the create time here 2253 * so that it's guaranteed to be later than the inode mod times. 2254 */ 2255 sum = 0; 2256 if (lfs_sb_getversion(fs) == 1) 2257 el_size = sizeof(u_long); 2258 else 2259 el_size = sizeof(u_int32_t); 2260 for (bpp = sp->bpp, i = nblocks - 1; i--; ) { 2261 ++bpp; 2262 /* Loop through gop_write cluster blocks */ 2263 for (byteoffset = 0; byteoffset < (*bpp)->b_bcount; 2264 byteoffset += lfs_sb_getbsize(fs)) { 2265 #ifdef LFS_USE_BC_INVAL 2266 if (((*bpp)->b_cflags & BC_INVAL) != 0 && 2267 (*bpp)->b_iodone != NULL) { 2268 if (copyin((void *)(*bpp)->b_saveaddr + 2269 byteoffset, dp, el_size)) { 2270 panic("lfs_writeseg: copyin failed [1]:" 2271 " ino %" PRIu64 " blk %" PRId64, 2272 VTOI((*bpp)->b_vp)->i_number, 2273 (*bpp)->b_lblkno); 2274 } 2275 } else 2276 #endif /* LFS_USE_BC_INVAL */ 2277 { 2278 sum = lfs_cksum_part((char *) 2279 (*bpp)->b_data + byteoffset, el_size, sum); 2280 } 2281 } 2282 } 2283 if (lfs_sb_getversion(fs) == 1) 2284 lfs_ss_setocreate(fs, ssp, time_second); 2285 else { 2286 lfs_ss_setcreate(fs, ssp, time_second); 2287 lfs_sb_addserial(fs, 1); 2288 lfs_ss_setserial(fs, ssp, lfs_sb_getserial(fs)); 2289 lfs_ss_setident(fs, ssp, lfs_sb_getident(fs)); 2290 } 2291 lfs_ss_setdatasum(fs, ssp, lfs_cksum_fold(sum)); 2292 sumstart = lfs_ss_getsumstart(fs); 2293 lfs_ss_setsumsum(fs, ssp, cksum((char *)ssp + sumstart, 2294 lfs_sb_getsumsize(fs) - sumstart)); 2295 2296 mutex_enter(&lfs_lock); 2297 lfs_sb_subbfree(fs, (lfs_btofsb(fs, ninos * lfs_sb_getibsize(fs)) + 2298 lfs_btofsb(fs, lfs_sb_getsumsize(fs)))); 2299 lfs_sb_adddmeta(fs, (lfs_btofsb(fs, ninos * lfs_sb_getibsize(fs)) + 2300 lfs_btofsb(fs, lfs_sb_getsumsize(fs)))); 2301 mutex_exit(&lfs_lock); 2302 2303 /* 2304 * When we simply write the blocks we lose a rotation for every block 2305 * written. To avoid this problem, we cluster the buffers into a 2306 * chunk and write the chunk. MAXPHYS is the largest size I/O 2307 * devices can handle, use that for the size of the chunks. 2308 * 2309 * Blocks that are already clusters (from GOP_WRITE), however, we 2310 * don't bother to copy into other clusters. 2311 */ 2312 2313 #define CHUNKSIZE MAXPHYS 2314 2315 if (devvp == NULL) 2316 panic("devvp is NULL"); 2317 for (bpp = sp->bpp, i = nblocks; i;) { 2318 cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i); 2319 cl = cbp->b_private; 2320 2321 cbp->b_flags |= B_ASYNC; 2322 cbp->b_cflags |= BC_BUSY; 2323 cbp->b_bcount = 0; 2324 2325 KASSERTMSG((bpp - sp->bpp <= 2326 (lfs_sb_getsumsize(fs) - SEGSUM_SIZE(fs)) 2327 / LFS_BLKPTRSIZE(fs)), 2328 "lfs_writeseg: real bpp overwrite"); 2329 KASSERTMSG((bpp - sp->bpp <= 2330 lfs_segsize(fs) / lfs_sb_getfsize(fs)), 2331 "lfs_writeseg: theoretical bpp overwrite"); 2332 2333 /* 2334 * Construct the cluster. 
2335 */ 2336 mutex_enter(&lfs_lock); 2337 ++fs->lfs_iocount; 2338 mutex_exit(&lfs_lock); 2339 while (i && cbp->b_bcount < CHUNKSIZE) { 2340 bp = *bpp; 2341 2342 if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount)) 2343 break; 2344 if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC)) 2345 break; 2346 2347 /* Clusters from GOP_WRITE are expedited */ 2348 if (bp->b_bcount > lfs_sb_getbsize(fs)) { 2349 if (cbp->b_bcount > 0) 2350 /* Put in its own buffer */ 2351 break; 2352 else { 2353 cbp->b_data = bp->b_data; 2354 } 2355 } else if (cbp->b_bcount == 0) { 2356 p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE, 2357 LFS_NB_CLUSTER); 2358 cl->flags |= LFS_CL_MALLOC; 2359 } 2360 KASSERTMSG((lfs_dtosn(fs, LFS_DBTOFSB(fs, bp->b_blkno + 2361 btodb(bp->b_bcount - 1))) == 2362 sp->seg_number), 2363 "segment overwrite: blk size %d daddr %" PRIx64 2364 " not in seg %d\n", 2365 bp->b_bcount, bp->b_blkno, 2366 sp->seg_number); 2367 2368 #ifdef LFS_USE_BC_INVAL 2369 /* 2370 * Fake buffers from the cleaner are marked as BC_INVAL. 2371 * We need to copy the data from user space rather than 2372 * from the buffer indicated. 2373 * XXX == what do I do on an error? 2374 */ 2375 if ((bp->b_cflags & BC_INVAL) != 0 && 2376 bp->b_iodone != NULL) { 2377 if (copyin(bp->b_saveaddr, p, bp->b_bcount)) 2378 panic("lfs_writeseg: " 2379 "copyin failed [2]"); 2380 } else 2381 #endif /* LFS_USE_BC_INVAL */ 2382 if (cl->flags & LFS_CL_MALLOC) { 2383 /* copy data into our cluster. */ 2384 memcpy(p, bp->b_data, bp->b_bcount); 2385 p += bp->b_bcount; 2386 } 2387 2388 cbp->b_bcount += bp->b_bcount; 2389 cl->bufsize += bp->b_bcount; 2390 2391 bp->b_flags &= ~B_READ; 2392 bp->b_error = 0; 2393 cl->bpp[cl->bufcount++] = bp; 2394 2395 vp = bp->b_vp; 2396 mutex_enter(&bufcache_lock); 2397 mutex_enter(vp->v_interlock); 2398 bp->b_oflags &= ~(BO_DELWRI | BO_DONE); 2399 reassignbuf(bp, vp); 2400 vp->v_numoutput++; 2401 mutex_exit(vp->v_interlock); 2402 mutex_exit(&bufcache_lock); 2403 2404 bpp++; 2405 i--; 2406 } 2407 if (fs->lfs_sp->seg_flags & SEGM_SYNC) 2408 BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL); 2409 else 2410 BIO_SETPRIO(cbp, BPRIO_TIMELIMITED); 2411 mutex_enter(devvp->v_interlock); 2412 devvp->v_numoutput++; 2413 mutex_exit(devvp->v_interlock); 2414 sp->bytes_written += cbp->b_bcount; 2415 VOP_STRATEGY(devvp, cbp); 2416 curlwp->l_ru.ru_oublock++; 2417 } 2418 2419 if (lfs_dostats) { 2420 ++lfs_stats.psegwrites; 2421 lfs_stats.blocktot += nblocks - 1; 2422 if (fs->lfs_sp->seg_flags & SEGM_SYNC) 2423 ++lfs_stats.psyncwrites; 2424 if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { 2425 ++lfs_stats.pcleanwrites; 2426 lfs_stats.cleanblocks += nblocks - 1; 2427 } 2428 } 2429 2430 return (lfs_initseg(fs, oflags) || do_again); 2431 } 2432 2433 void 2434 lfs_writesuper(struct lfs *fs, daddr_t daddr) 2435 { 2436 struct buf *bp; 2437 struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp; 2438 2439 ASSERT_MAYBE_SEGLOCK(fs); 2440 if (fs->lfs_is64) { 2441 KASSERT(fs->lfs_dlfs_u.u_64.dlfs_magic == LFS64_MAGIC); 2442 } else { 2443 KASSERT(fs->lfs_dlfs_u.u_32.dlfs_magic == LFS_MAGIC); 2444 } 2445 /* 2446 * If we can write one superblock while another is in 2447 * progress, we risk not having a complete checkpoint if we crash. 2448 * So, block here if a superblock write is in progress. 
2449 */ 2450 mutex_enter(&lfs_lock); 2451 while (fs->lfs_sbactive) { 2452 mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0, 2453 &lfs_lock); 2454 } 2455 fs->lfs_sbactive = daddr; 2456 mutex_exit(&lfs_lock); 2457 2458 /* Set timestamp of this version of the superblock */ 2459 if (lfs_sb_getversion(fs) == 1) 2460 lfs_sb_setotstamp(fs, time_second); 2461 lfs_sb_settstamp(fs, time_second); 2462 2463 /* The next chunk of code relies on this assumption */ 2464 CTASSERT(sizeof(struct dlfs) == sizeof(struct dlfs64)); 2465 2466 /* Checksum the superblock and copy it into a buffer. */ 2467 lfs_sb_setcksum(fs, lfs_sb_cksum(fs)); 2468 bp = lfs_newbuf(fs, devvp, 2469 LFS_FSBTODB(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK); 2470 memcpy(bp->b_data, &fs->lfs_dlfs_u, sizeof(struct dlfs)); 2471 memset((char *)bp->b_data + sizeof(struct dlfs), 0, 2472 LFS_SBPAD - sizeof(struct dlfs)); 2473 2474 mutex_enter(&bufcache_lock); 2475 KASSERT(bp->b_cflags & BC_BUSY); 2476 mutex_exit(&bufcache_lock); 2477 KASSERT((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0); 2478 bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC; 2479 bp->b_error = 0; 2480 bp->b_iodone = lfs_super_aiodone; 2481 2482 if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC) 2483 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 2484 else 2485 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 2486 curlwp->l_ru.ru_oublock++; 2487 2488 mutex_enter(devvp->v_interlock); 2489 devvp->v_numoutput++; 2490 mutex_exit(devvp->v_interlock); 2491 2492 mutex_enter(&lfs_lock); 2493 ++fs->lfs_iocount; 2494 mutex_exit(&lfs_lock); 2495 VOP_STRATEGY(devvp, bp); 2496 } 2497 2498 /* 2499 * Logical block number match routines used when traversing the dirty block 2500 * chain. 2501 */ 2502 int 2503 lfs_match_fake(struct lfs *fs, struct buf *bp) 2504 { 2505 2506 ASSERT_SEGLOCK(fs); 2507 return LFS_IS_MALLOC_BUF(bp); 2508 } 2509 2510 #if 0 2511 int 2512 lfs_match_real(struct lfs *fs, struct buf *bp) 2513 { 2514 2515 ASSERT_SEGLOCK(fs); 2516 return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp)); 2517 } 2518 #endif 2519 2520 int 2521 lfs_match_data(struct lfs *fs, struct buf *bp) 2522 { 2523 2524 ASSERT_SEGLOCK(fs); 2525 return (bp->b_lblkno >= 0); 2526 } 2527 2528 int 2529 lfs_match_indir(struct lfs *fs, struct buf *bp) 2530 { 2531 daddr_t lbn; 2532 2533 ASSERT_SEGLOCK(fs); 2534 lbn = bp->b_lblkno; 2535 return (lbn < 0 && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == 0); 2536 } 2537 2538 int 2539 lfs_match_dindir(struct lfs *fs, struct buf *bp) 2540 { 2541 daddr_t lbn; 2542 2543 ASSERT_SEGLOCK(fs); 2544 lbn = bp->b_lblkno; 2545 return (lbn < 0 && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == 1); 2546 } 2547 2548 int 2549 lfs_match_tindir(struct lfs *fs, struct buf *bp) 2550 { 2551 daddr_t lbn; 2552 2553 ASSERT_SEGLOCK(fs); 2554 lbn = bp->b_lblkno; 2555 return (lbn < 0 && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == 2); 2556 } 2557 2558 void 2559 lfs_free_aiodone(struct buf *bp) 2560 { 2561 struct lfs *fs; 2562 2563 fs = bp->b_private; 2564 ASSERT_NO_SEGLOCK(fs); 2565 lfs_freebuf(fs, bp); 2566 } 2567 2568 static void 2569 lfs_super_aiodone(struct buf *bp) 2570 { 2571 workqueue_enqueue(lfs_super_wq, (struct work *)bp, NULL); 2572 } 2573 2574 void 2575 lfs_super_work(struct work *wk, void *arg) 2576 { 2577 struct buf *bp = (struct buf *)wk; 2578 struct lfs *fs; 2579 2580 fs = bp->b_private; 2581 ASSERT_NO_SEGLOCK(fs); 2582 2583 mutex_enter(&bufcache_lock); 2584 KASSERT(bp->b_cflags & BC_BUSY); 2585 mutex_exit(&bufcache_lock); 2586 lfs_freebuf(fs, bp); 2587 2588 mutex_enter(&lfs_lock); 2589 fs->lfs_sbactive = 0; 2590 if 
(--fs->lfs_iocount <= 1) 2591 wakeup(&fs->lfs_iocount); 2592 wakeup(&fs->lfs_sbactive); 2593 mutex_exit(&lfs_lock); 2594 } 2595 2596 static void 2597 lfs_cluster_aiodone(struct buf *bp) 2598 { 2599 workqueue_enqueue(lfs_cluster_wq, (struct work *)bp, NULL); 2600 } 2601 2602 void 2603 lfs_cluster_work(struct work *wk, void *arg) 2604 { 2605 struct buf *bp = (struct buf *)wk; 2606 struct lfs_cluster *cl; 2607 struct lfs *fs; 2608 struct buf *tbp, *fbp; 2609 struct vnode *vp, *devvp, *ovp; 2610 struct inode *ip; 2611 int error; 2612 2613 KERNEL_LOCK(1, curlwp); 2614 2615 error = bp->b_error; 2616 cl = bp->b_private; 2617 fs = cl->fs; 2618 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 2619 ASSERT_NO_SEGLOCK(fs); 2620 2621 /* Put the pages back, and release the buffer */ 2622 while (cl->bufcount--) { 2623 tbp = cl->bpp[cl->bufcount]; 2624 KASSERT(tbp->b_cflags & BC_BUSY); 2625 if (error) { 2626 tbp->b_error = error; 2627 } 2628 2629 /* 2630 * We're done with tbp. If it has not been re-dirtied since 2631 * the cluster was written, free it. Otherwise, keep it on 2632 * the locked list to be written again. 2633 */ 2634 vp = tbp->b_vp; 2635 2636 tbp->b_flags &= ~B_GATHERED; 2637 2638 #ifdef DEBUG 2639 if ((tbp)->b_vp == (fs)->lfs_ivnode) 2640 LFS_ENTER_LOG("clear", __FILE__, __LINE__, 2641 tbp->b_lblkno, tbp->b_flags, curproc->p_pid); 2642 #endif 2643 2644 mutex_enter(&bufcache_lock); 2645 if (tbp->b_iodone == NULL) { 2646 KASSERT((tbp->b_flags & B_LOCKED) || 2647 (tbp->b_cflags & (BC_NOCACHE | BC_INVAL))); 2648 bremfree(tbp); 2649 if (vp) { 2650 mutex_enter(vp->v_interlock); 2651 reassignbuf(tbp, vp); 2652 mutex_exit(vp->v_interlock); 2653 } 2654 tbp->b_flags |= B_ASYNC; /* for biodone */ 2655 2656 /* 2657 * Check for ordinary buffers on regular files. 2658 * These are created by the cleaner when coalescing 2659 * files. The rest of the filesystem ignores these, 2660 * so the buffer cache version of this block 2661 * is invalid. 2662 */ 2663 if (tbp->b_lblkno >= 0 && tbp->b_vp != NULL 2664 && tbp->b_vp->v_type == VREG) { 2665 tbp->b_cflags |= BC_INVAL; 2666 } 2667 } 2668 2669 if ((tbp->b_flags & B_LOCKED) && !(tbp->b_oflags & BO_DELWRI)) 2670 LFS_UNLOCK_BUF(tbp); 2671 2672 if (tbp->b_oflags & BO_DONE) { 2673 DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n", 2674 cl->bufcount, (long)tbp->b_flags)); 2675 } 2676 2677 if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) { 2678 /* 2679 * A buffer from the page daemon. 2680 * We use the same iodone as it does, 2681 * so we must manually disassociate its 2682 * buffers from the vp. 2683 */ 2684 if ((ovp = tbp->b_vp) != NULL) { 2685 /* This is just silly */ 2686 mutex_enter(ovp->v_interlock); 2687 brelvp(tbp); 2688 mutex_exit(ovp->v_interlock); 2689 tbp->b_vp = vp; 2690 tbp->b_objlock = vp->v_interlock; 2691 } 2692 /* Put it back the way it was */ 2693 tbp->b_flags |= B_ASYNC; 2694 /* Master buffers have BC_AGE */ 2695 if (tbp->b_private == tbp) 2696 tbp->b_cflags |= BC_AGE; 2697 } 2698 mutex_exit(&bufcache_lock); 2699 2700 biodone(tbp); 2701 2702 /* 2703 * If this is the last block for this vnode, but 2704 * there are other blocks on its dirty list, 2705 * set IN_MODIFIED. Only do this for our mount point, 2706 * not for, e.g., inode blocks that are attached to 2707 * the devvp. 2708 * XXX KS - Shouldn't we set *both* if both types 2709 * of blocks are present (traverse the dirty list?) 
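 * XXX (For now only IN_MODIFIED is set here, on the theory that a
 * XXX vnode with any remaining dirty blocks will be picked up again
 * XXX by a later segment write in any case.)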
2710 */ 2711 mutex_enter(vp->v_interlock); 2712 mutex_enter(&lfs_lock); 2713 if (vp != devvp && vp->v_numoutput == 0 && 2714 (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) { 2715 ip = VTOI(vp); 2716 DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n", 2717 ip->i_number)); 2718 LFS_SET_UINO(ip, IN_MODIFIED); 2719 } 2720 cv_broadcast(&vp->v_cv); 2721 mutex_exit(&lfs_lock); 2722 mutex_exit(vp->v_interlock); 2723 } 2724 2725 /* Fix up the cluster buffer, and release it */ 2726 if (cl->flags & LFS_CL_MALLOC) 2727 lfs_free(fs, bp->b_data, LFS_NB_CLUSTER); 2728 putiobuf(bp); 2729 2730 /* Note i/o done */ 2731 if (cl->flags & LFS_CL_SYNC) { 2732 if (--cl->seg->seg_iocount == 0) 2733 wakeup(&cl->seg->seg_iocount); 2734 } 2735 2736 pool_put(&fs->lfs_bpppool, cl->bpp); 2737 cl->bpp = NULL; 2738 pool_put(&fs->lfs_clpool, cl); 2739 mutex_enter(&lfs_lock); 2740 2741 KASSERTMSG((fs->lfs_iocount != 0), 2742 "lfs_cluster_aiodone: zero iocount"); 2743 if (--fs->lfs_iocount <= 1) 2744 wakeup(&fs->lfs_iocount); 2745 mutex_exit(&lfs_lock); 2746 2747 KERNEL_UNLOCK_ONE(curlwp); 2748 } 2749 2750 /* 2751 * Shellsort (diminishing increment sort) from Data Structures and 2752 * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290; 2753 * see also Knuth Vol. 3, page 84. The increments are selected from 2754 * formula (8), page 95. Roughly O(N^3/2). 2755 */ 2756 /* 2757 * This is our own private copy of shellsort because we want to sort 2758 * two parallel arrays (the array of buffer pointers and the array of 2759 * logical block numbers) simultaneously. Note that we cast the array 2760 * of logical block numbers to a unsigned in this routine so that the 2761 * negative block numbers (meta data blocks) sort AFTER the data blocks. 2762 */ 2763 2764 static void 2765 lfs_shellsort(struct lfs *fs, 2766 struct buf **bp_array, union lfs_blocks *lb_array, 2767 int nmemb, int size) 2768 { 2769 static int __rsshell_increments[] = { 4, 1, 0 }; 2770 int incr, *incrp, t1, t2; 2771 struct buf *bp_temp; 2772 2773 #ifdef DEBUG 2774 incr = 0; 2775 for (t1 = 0; t1 < nmemb; t1++) { 2776 for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { 2777 if (lfs_blocks_get(fs, lb_array, incr++) != bp_array[t1]->b_lblkno + t2) { 2778 /* dump before panic */ 2779 printf("lfs_shellsort: nmemb=%d, size=%d\n", 2780 nmemb, size); 2781 incr = 0; 2782 for (t1 = 0; t1 < nmemb; t1++) { 2783 const struct buf *bp = bp_array[t1]; 2784 2785 printf("bp[%d]: lbn=%" PRIu64 ", size=%" 2786 PRIu64 "\n", t1, 2787 (uint64_t)bp->b_bcount, 2788 (uint64_t)bp->b_lblkno); 2789 printf("lbns:"); 2790 for (t2 = 0; t2 * size < bp->b_bcount; 2791 t2++) { 2792 printf(" %jd", 2793 (intmax_t)lfs_blocks_get(fs, lb_array, incr++)); 2794 } 2795 printf("\n"); 2796 } 2797 panic("lfs_shellsort: inconsistent input"); 2798 } 2799 } 2800 } 2801 #endif 2802 2803 for (incrp = __rsshell_increments; (incr = *incrp++) != 0;) 2804 for (t1 = incr; t1 < nmemb; ++t1) 2805 for (t2 = t1 - incr; t2 >= 0;) 2806 if ((u_int64_t)bp_array[t2]->b_lblkno > 2807 (u_int64_t)bp_array[t2 + incr]->b_lblkno) { 2808 bp_temp = bp_array[t2]; 2809 bp_array[t2] = bp_array[t2 + incr]; 2810 bp_array[t2 + incr] = bp_temp; 2811 t2 -= incr; 2812 } else 2813 break; 2814 2815 /* Reform the list of logical blocks */ 2816 incr = 0; 2817 for (t1 = 0; t1 < nmemb; t1++) { 2818 for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { 2819 lfs_blocks_set(fs, lb_array, incr++, 2820 bp_array[t1]->b_lblkno + t2); 2821 } 2822 } 2823 } 2824 2825 /* 2826 * Set up an FINFO entry for a new file. 
The fip pointer is assumed to 2827 * point at uninitialized space. 2828 */ 2829 void 2830 lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers) 2831 { 2832 struct segment *sp = fs->lfs_sp; 2833 SEGSUM *ssp; 2834 2835 KASSERT(vers > 0); 2836 2837 if (sp->seg_bytes_left < lfs_sb_getbsize(fs) || 2838 sp->sum_bytes_left < FINFOSIZE(fs) + LFS_BLKPTRSIZE(fs)) 2839 (void) lfs_writeseg(fs, fs->lfs_sp); 2840 2841 sp->sum_bytes_left -= FINFOSIZE(fs); 2842 ssp = (SEGSUM *)sp->segsum; 2843 lfs_ss_setnfinfo(fs, ssp, lfs_ss_getnfinfo(fs, ssp) + 1); 2844 lfs_fi_setnblocks(fs, sp->fip, 0); 2845 lfs_fi_setino(fs, sp->fip, ino); 2846 lfs_fi_setversion(fs, sp->fip, vers); 2847 } 2848 2849 /* 2850 * Release the FINFO entry, either clearing out an unused entry or 2851 * advancing us to the next available entry. 2852 */ 2853 void 2854 lfs_release_finfo(struct lfs *fs) 2855 { 2856 struct segment *sp = fs->lfs_sp; 2857 SEGSUM *ssp; 2858 2859 if (lfs_fi_getnblocks(fs, sp->fip) != 0) { 2860 sp->fip = NEXT_FINFO(fs, sp->fip); 2861 lfs_blocks_fromfinfo(fs, &sp->start_lbp, sp->fip); 2862 } else { 2863 /* XXX shouldn't this update sp->fip? */ 2864 sp->sum_bytes_left += FINFOSIZE(fs); 2865 ssp = (SEGSUM *)sp->segsum; 2866 lfs_ss_setnfinfo(fs, ssp, lfs_ss_getnfinfo(fs, ssp) - 1); 2867 } 2868 } 2869