1 /* $NetBSD: lfs_alloc.c,v 1.145 2025/09/21 14:19:14 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant (at) hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 /* 32 * Copyright (c) 1991, 1993 33 * The Regents of the University of California. All rights reserved. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 58 * 59 * @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94 60 */ 61 62 #include <sys/cdefs.h> 63 __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.145 2025/09/21 14:19:14 christos Exp $"); 64 65 #if defined(_KERNEL_OPT) 66 #include "opt_quota.h" 67 #endif 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/kernel.h> 72 #include <sys/buf.h> 73 #include <sys/lock.h> 74 #include <sys/vnode.h> 75 #include <sys/syslog.h> 76 #include <sys/mount.h> 77 #include <sys/malloc.h> 78 #include <sys/pool.h> 79 #include <sys/proc.h> 80 #include <sys/kauth.h> 81 82 #include <ufs/lfs/ulfs_quotacommon.h> 83 #include <ufs/lfs/ulfs_inode.h> 84 #include <ufs/lfs/ulfsmount.h> 85 #include <ufs/lfs/ulfs_extern.h> 86 87 #include <ufs/lfs/lfs.h> 88 #include <ufs/lfs/lfs_accessors.h> 89 #include <ufs/lfs/lfs_extern.h> 90 #include <ufs/lfs/lfs_kernel.h> 91 92 int lfs_do_check_freelist = 0; 93 94 /* Constants for inode free bitmap */ 95 #define BMSHIFT 5 /* 2 ** 5 = 32 */ 96 #define BMMASK ((1 << BMSHIFT) - 1) 97 #define SET_BITMAP_FREE(F, I) do { \ 98 DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d set\n", (int)(I), \ 99 (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \ 100 (F)->lfs_ino_bitmap[(I) >> BMSHIFT] |= (1U << ((I) & BMMASK)); \ 101 } while (0) 102 #define CLR_BITMAP_FREE(F, I) do { \ 103 DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d clr\n", (int)(I), \ 104 (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \ 105 (F)->lfs_ino_bitmap[(I) >> BMSHIFT] &= ~(1U << ((I) & BMMASK)); \ 106 } while(0) 107 108 #define ISSET_BITMAP_FREE(F, I) \ 109 ((F)->lfs_ino_bitmap[(I) >> BMSHIFT] & (1U << ((I) & BMMASK))) 110 111 /* 112 * Add a new block to the Ifile, to accommodate future file creations. 113 * Called with the segment lock held. 114 */ 115 int 116 lfs_extend_ifile(struct lfs *fs, kauth_cred_t cred) 117 { 118 struct vnode *vp; 119 struct inode *ip; 120 IFILE64 *ifp64; 121 IFILE32 *ifp32; 122 IFILE_V1 *ifp_v1; 123 struct buf *bp, *cbp; 124 int error; 125 daddr_t i, blkno, xmax; 126 ino_t oldhead, maxino, tail; 127 CLEANERINFO *cip; 128 129 ASSERT_SEGLOCK(fs); 130 131 /* XXX should check or assert that we aren't readonly. */ 132 133 /* 134 * Get a block and extend the ifile inode. Leave the buffer for 135 * the block in bp. 136 */ 137 138 vp = fs->lfs_ivnode; 139 ip = VTOI(vp); 140 blkno = lfs_lblkno(fs, ip->i_size); 141 if ((error = lfs_balloc(vp, ip->i_size, lfs_sb_getbsize(fs), cred, 0, 142 &bp)) != 0) { 143 return (error); 144 } 145 ip->i_size += lfs_sb_getbsize(fs); 146 lfs_dino_setsize(fs, ip->i_din, ip->i_size); 147 uvm_vnp_setsize(vp, ip->i_size); 148 149 /* 150 * Compute the new number of inodes, and reallocate the in-memory 151 * inode freemap. 152 */ 153 154 maxino = ((ip->i_size >> lfs_sb_getbshift(fs)) - lfs_sb_getcleansz(fs) - 155 lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs); 156 fs->lfs_ino_bitmap = (lfs_bm_t *) 157 realloc(fs->lfs_ino_bitmap, ((maxino + BMMASK) >> BMSHIFT) * 158 sizeof(lfs_bm_t), M_SEGMENT, M_WAITOK); 159 KASSERT(fs->lfs_ino_bitmap != NULL); 160 161 /* first new inode number */ 162 i = (blkno - lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs)) * 163 lfs_sb_getifpb(fs); 164 165 /* inode number to stop at (XXX: why *x*max?) */ 166 xmax = i + lfs_sb_getifpb(fs); 167 168 /* 169 * We insert the new inodes at the head of the free list. 170 * Under normal circumstances, the free list is empty here, 171 * so we are also incidentally placing them at the end (which 172 * we must do if we are to keep them in order). 173 */ 174 LFS_GET_HEADFREE(fs, cip, cbp, &oldhead); 175 LFS_PUT_HEADFREE(fs, cip, cbp, i); 176 LFS_GET_TAILFREE(fs, cip, cbp, &tail); 177 DLOG((DLOG_ALLOC, "oldhead=%jd, i=%jd, xmax=%jd, oldtail=%jd\n", 178 (intmax_t)oldhead, (intmax_t)i, (intmax_t)xmax, 179 (intmax_t)tail)); 180 if (tail == LFS_UNUSED_INUM) { 181 tail = xmax - 1; 182 LFS_PUT_TAILFREE(fs, cip, cbp, tail); 183 } 184 KASSERTMSG((lfs_sb_getfreehd(fs) != LFS_UNUSED_INUM), 185 "inode 0 allocated [2]"); 186 187 188 /* 189 * Initialize the ifile block. 190 * 191 * XXX: these loops should be restructured to use the accessor 192 * functions instead of using cutpaste polymorphism. 193 */ 194 195 if (fs->lfs_is64) { 196 for (ifp64 = (IFILE64 *)bp->b_data; i < xmax; ++ifp64) { 197 SET_BITMAP_FREE(fs, i); 198 ifp64->if_version = 1; 199 ifp64->if_daddr = LFS_UNUSED_DADDR; 200 ifp64->if_nextfree = ++i; 201 } 202 ifp64--; 203 ifp64->if_nextfree = oldhead; 204 } else if (lfs_sb_getversion(fs) > 1) { 205 for (ifp32 = (IFILE32 *)bp->b_data; i < xmax; ++ifp32) { 206 SET_BITMAP_FREE(fs, i); 207 ifp32->if_version = 1; 208 ifp32->if_daddr = LFS_UNUSED_DADDR; 209 ifp32->if_nextfree = ++i; 210 } 211 ifp32--; 212 ifp32->if_nextfree = oldhead; 213 } else { 214 for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < xmax; ++ifp_v1) { 215 SET_BITMAP_FREE(fs, i); 216 ifp_v1->if_version = 1; 217 ifp_v1->if_daddr = LFS_UNUSED_DADDR; 218 ifp_v1->if_nextfree = ++i; 219 } 220 ifp_v1--; 221 ifp_v1->if_nextfree = oldhead; 222 } 223 DLOG((DLOG_ALLOC, " now head=%jd tail=%jd\n", 224 (intmax_t)xmax - lfs_sb_getifpb(fs), (intmax_t)tail)); 225 226 /* 227 * Write out the new block. 228 */ 229 230 (void) LFS_BWRITE_LOG(bp); /* Ifile */ 231 232 return 0; 233 } 234 235 /* 236 * Allocate an inode for a new file. 237 * 238 * Takes the segment lock. Also (while holding it) takes lfs_lock 239 * to frob fs->lfs_fmod. 240 * 241 * XXX: the mode argument is unused; should just get rid of it. 242 */ 243 /* ARGSUSED */ 244 /* VOP_BWRITE 2i times */ 245 int 246 lfs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, 247 ino_t *ino, int *gen) 248 { 249 struct lfs *fs; 250 struct buf *bp, *cbp; 251 IFILE *ifp; 252 int error; 253 CLEANERINFO *cip; 254 255 fs = VTOI(pvp)->i_lfs; 256 if (fs->lfs_ronly) 257 return EROFS; 258 259 if (!(fs->lfs_flags & LFS_NOTYET)) 260 ASSERT_NO_SEGLOCK(fs); 261 262 DEBUG_CHECK_FREELIST(fs); 263 264 lfs_seglock(fs, SEGM_PROT); 265 266 /* Get the head of the freelist. */ 267 LFS_GET_HEADFREE(fs, cip, cbp, ino); 268 269 /* paranoia */ 270 KASSERT(*ino != LFS_UNUSED_INUM && *ino != LFS_IFILE_INUM); 271 DLOG((DLOG_ALLOC, "lfs_valloc: allocate inode %" PRId64 "\n", 272 *ino)); 273 274 /* Update the in-memory inode freemap */ 275 CLR_BITMAP_FREE(fs, *ino); 276 277 /* 278 * Fetch the ifile entry and make sure the inode is really 279 * free. 280 */ 281 LFS_IENTRY(ifp, fs, *ino, bp); 282 if (lfs_if_getdaddr(fs, ifp) != LFS_UNUSED_DADDR) 283 panic("lfs_valloc: inuse inode %" PRId64 " on the free list", 284 *ino); 285 286 /* Update the inode freelist head in the superblock. */ 287 LFS_PUT_HEADFREE(fs, cip, cbp, lfs_if_getnextfree(fs, ifp)); 288 DLOG((DLOG_ALLOC, "lfs_valloc: headfree %" PRId64 " -> %ju\n", 289 *ino, (uintmax_t)lfs_if_getnextfree(fs, ifp))); 290 291 /* 292 * Retrieve the version number from the ifile entry. It was 293 * bumped by vfree, so don't bump it again. 294 */ 295 *gen = lfs_if_getversion(fs, ifp); 296 297 /* Done with ifile entry */ 298 299 /* 300 * Note LFS_ILLEGAL_DADDR == LFS_UNUSED_DADDR 301 * unless DEBUG is set. In that case we 302 * use that address to track inodes that have been 303 * allocated (and so not on the free list) but do not 304 * have proper disk addresses yet. 305 */ 306 lfs_if_setdaddr(fs, ifp, LFS_ILLEGAL_DADDR); 307 lfs_if_setnextfree(fs, ifp, LFS_UNUSED_INUM); 308 LFS_BWRITE_LOG(bp); 309 310 if (lfs_sb_getfreehd(fs) == LFS_UNUSED_INUM) { 311 /* 312 * No more inodes; extend the ifile so that the next 313 * lfs_valloc will succeed. 314 */ 315 if ((error = lfs_extend_ifile(fs, cred)) != 0) { 316 /* restore the freelist */ 317 LFS_PUT_HEADFREE(fs, cip, cbp, *ino); 318 319 /* unlock and return */ 320 lfs_segunlock(fs); 321 return error; 322 } 323 } 324 KASSERTMSG((lfs_sb_getfreehd(fs) != LFS_UNUSED_INUM), 325 "inode 0 allocated [3]"); 326 327 /* Set superblock modified bit */ 328 mutex_enter(&lfs_lock); 329 fs->lfs_fmod = 1; 330 mutex_exit(&lfs_lock); 331 332 /* increment file count */ 333 lfs_sb_addnfiles(fs, 1); 334 335 /* done */ 336 lfs_segunlock(fs); 337 338 DEBUG_CHECK_FREELIST(fs); 339 return 0; 340 } 341 342 /* 343 * Allocate an inode for a new file, with given inode number and 344 * version. 345 * 346 * Called in the same context as lfs_valloc and therefore shares the 347 * same locking assumptions. 348 */ 349 int 350 lfs_valloc_fixed(struct lfs *fs, ino_t ino, int vers) 351 { 352 IFILE *ifp; 353 struct buf *bp, *cbp; 354 ino_t headino, thisino, oldnext, tailino; 355 CLEANERINFO *cip; 356 int extended = 0; 357 358 if (fs->lfs_ronly) 359 return EROFS; 360 361 if (!(fs->lfs_flags & LFS_NOTYET)) 362 ASSERT_NO_SEGLOCK(fs); 363 364 DEBUG_CHECK_FREELIST(fs); 365 366 lfs_seglock(fs, SEGM_PROT); 367 368 /* 369 * If the ifile is too short to contain this inum, extend it. 370 * 371 * XXX: lfs_extend_ifile should take a size instead of always 372 * doing just one block at time. 373 */ 374 while (VTOI(fs->lfs_ivnode)->i_size <= (ino / 375 lfs_sb_getifpb(fs) + lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs)) 376 << lfs_sb_getbshift(fs)) { 377 DLOG((DLOG_ALLOC, "extend ifile to accommodate ino %jd\n", 378 (intmax_t)ino)); 379 lfs_extend_ifile(fs, NOCRED); 380 extended = 1; 381 } 382 383 /* 384 * Get the inode freelist next pointer. 385 */ 386 LFS_IENTRY(ifp, fs, ino, bp); 387 oldnext = lfs_if_getnextfree(fs, ifp); 388 brelse(bp, 0); 389 390 /* Get fail of inode freelist */ 391 LFS_GET_TAILFREE(fs, cip, cbp, &tailino); 392 393 /* Get head of inode freelist */ 394 LFS_GET_HEADFREE(fs, cip, cbp, &headino); 395 if (headino == ino) { 396 /* Easy case: the inode we wanted was at the head */ 397 LFS_PUT_HEADFREE(fs, cip, cbp, oldnext); 398 } else { 399 ino_t nextfree = 0, maxino, count; /* XXX: gcc */ 400 401 /* Have to find the desired inode in the freelist... */ 402 maxino = ((VTOI(fs->lfs_ivnode)->i_size >> lfs_sb_getbshift(fs)) 403 - lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) 404 * lfs_sb_getifpb(fs); 405 count = 0; 406 407 thisino = headino; 408 while (thisino != LFS_UNUSED_INUM) { 409 /* read this ifile entry */ 410 LFS_IENTRY(ifp, fs, thisino, bp); 411 nextfree = lfs_if_getnextfree(fs, ifp); 412 /* stop if we find it or we hit the end */ 413 if (nextfree == ino || 414 nextfree == LFS_UNUSED_INUM) 415 break; 416 /* nope, keep going... */ 417 thisino = nextfree; 418 brelse(bp, 0); 419 if (++count > maxino) 420 break; 421 } 422 if (count > maxino) { 423 panic("loop in free list"); 424 lfs_segunlock(fs); 425 return ENOENT; 426 } 427 if (nextfree == LFS_UNUSED_INUM) { 428 /* hit the end -- this inode is not available */ 429 brelse(bp, 0); 430 lfs_segunlock(fs); 431 if (extended) 432 panic("extended ifile to accommodate but inode not found"); 433 return ENOENT; 434 } 435 /* found it; update the next pointer */ 436 lfs_if_setnextfree(fs, ifp, oldnext); 437 /* write the ifile block */ 438 LFS_BWRITE_LOG(bp); 439 440 /* If our inode was the tail, thisino is now the tail */ 441 if (tailino == ino) 442 LFS_PUT_TAILFREE(fs, cip, cbp, thisino); 443 } 444 445 /* Clear nextfree, note daddr, and set generation number */ 446 LFS_IENTRY(ifp, fs, ino, bp); 447 lfs_if_setversion(fs, ifp, vers); 448 lfs_if_setnextfree(fs, ifp, LFS_UNUSED_INUM); 449 /* See comment in lfs_valloc */ 450 lfs_if_setdaddr(fs, ifp, LFS_ILLEGAL_DADDR); 451 LFS_BWRITE_LOG(bp); 452 453 if (lfs_sb_getfreehd(fs) == LFS_UNUSED_INUM) { 454 int error; 455 /* 456 * No more inodes; extend the ifile so that the next 457 * lfs_valloc will succeed. 458 */ 459 if ((error = lfs_extend_ifile(fs, NOCRED)) != 0) { 460 /* restore the freelist */ 461 LFS_PUT_HEADFREE(fs, cip, cbp, ino); 462 463 /* unlock and return */ 464 lfs_segunlock(fs); 465 return error; 466 } 467 } 468 KASSERTMSG((lfs_sb_getfreehd(fs) != LFS_UNUSED_INUM), 469 "inode 0 allocated [4]"); 470 471 /* done */ 472 lfs_segunlock(fs); 473 474 DEBUG_CHECK_FREELIST(fs); 475 476 return 0; 477 } 478 479 #if 0 480 /* 481 * Find the highest-numbered allocated inode. 482 * This will be used to shrink the Ifile. 483 */ 484 static inline ino_t 485 lfs_last_alloc_ino(struct lfs *fs) 486 { 487 ino_t ino, maxino; 488 489 maxino = ((fs->lfs_ivnode->v_size >> lfs_sb_getbshift(fs)) - 490 lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * 491 lfs_sb_getifpb(fs); 492 for (ino = maxino - 1; ino > LFS_UNUSED_INUM; --ino) { 493 if (ISSET_BITMAP_FREE(fs, ino) == 0) 494 break; 495 } 496 return ino; 497 } 498 499 /* 500 * Find the previous (next lowest numbered) free inode, if any. 501 * If there is none, return LFS_UNUSED_INUM. 502 * 503 * XXX: locking? 504 */ 505 static inline ino_t 506 lfs_freelist_prev(struct lfs *fs, ino_t ino) 507 { 508 ino_t tino, bound, bb, freehdbb; 509 510 if (lfs_sb_getfreehd(fs) == LFS_UNUSED_INUM) { 511 /* No free inodes at all */ 512 return LFS_UNUSED_INUM; 513 } 514 515 /* Search our own word first */ 516 bound = ino & ~BMMASK; 517 for (tino = ino - 1; tino >= bound && tino > LFS_UNUSED_INUM; tino--) 518 if (ISSET_BITMAP_FREE(fs, tino)) 519 return tino; 520 /* If there are no lower words to search, just return */ 521 if (ino >> BMSHIFT == 0) 522 return LFS_UNUSED_INUM; 523 524 /* 525 * Find a word with a free inode in it. We have to be a bit 526 * careful here since ino_t is unsigned. 527 */ 528 freehdbb = (lfs_sb_getfreehd(fs) >> BMSHIFT); 529 for (bb = (ino >> BMSHIFT) - 1; bb >= freehdbb && bb > 0; --bb) 530 if (fs->lfs_ino_bitmap[bb]) 531 break; 532 if (fs->lfs_ino_bitmap[bb] == 0) 533 return LFS_UNUSED_INUM; 534 535 /* Search the word we found */ 536 for (tino = (bb << BMSHIFT) | BMMASK; tino >= (bb << BMSHIFT) && 537 tino > LFS_UNUSED_INUM; tino--) 538 if (ISSET_BITMAP_FREE(fs, tino)) 539 break; 540 541 /* Avoid returning reserved inode numbers */ 542 if (tino <= LFS_IFILE_INUM) 543 tino = LFS_UNUSED_INUM; 544 545 return tino; 546 } 547 #endif 548 549 /* 550 * Free an inode. 551 * 552 * Takes lfs_seglock. Also (independently) takes vp->v_interlock. 553 */ 554 /* ARGUSED */ 555 /* VOP_BWRITE 2i times */ 556 int 557 lfs_vfree(struct vnode *vp, ino_t ino, int mode) 558 { 559 SEGUSE *sup; 560 CLEANERINFO *cip; 561 struct buf *cbp, *bp; 562 IFILE *ifp; 563 struct inode *ip; 564 struct lfs *fs; 565 daddr_t old_iaddr; 566 567 /* Get the inode number and file system. */ 568 ip = VTOI(vp); 569 fs = ip->i_lfs; 570 ino = ip->i_number; 571 572 /* XXX: assert not readonly */ 573 574 ASSERT_NO_SEGLOCK(fs); 575 DLOG((DLOG_ALLOC, "lfs_vfree: free ino %lld\n", (long long)ino)); 576 577 /* Drain of pending writes */ 578 mutex_enter(vp->v_interlock); 579 while (lfs_sb_getversion(fs) > 1 && WRITEINPROG(vp)) { 580 cv_wait(&vp->v_cv, vp->v_interlock); 581 } 582 mutex_exit(vp->v_interlock); 583 584 lfs_seglock(fs, SEGM_PROT); 585 586 DEBUG_CHECK_FREELIST(fs); 587 588 /* 589 * If the inode was in a dirop, it isn't now. 590 * 591 * XXX: why are (v_uflag & VU_DIROP) and (ip->i_state & IN_ADIROP) 592 * not updated together in one function? (and why do both exist, 593 * anyway?) 594 */ 595 UNMARK_VNODE(vp); 596 597 mutex_enter(&lfs_lock); 598 if (vp->v_uflag & VU_DIROP) { 599 vp->v_uflag &= ~VU_DIROP; 600 --lfs_dirvcount; 601 --fs->lfs_dirvcount; 602 TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); 603 wakeup(&fs->lfs_dirvcount); 604 wakeup(&lfs_dirvcount); 605 mutex_exit(&lfs_lock); 606 vrele(vp); 607 608 /* 609 * If this inode is not going to be written any more, any 610 * segment accounting left over from its truncation needs 611 * to occur at the end of the next dirops flush. Attach 612 * them to the fs-wide list for that purpose. 613 */ 614 if (LIST_FIRST(&ip->i_lfs_segdhd) != NULL) { 615 struct segdelta *sd; 616 617 while((sd = LIST_FIRST(&ip->i_lfs_segdhd)) != NULL) { 618 LIST_REMOVE(sd, list); 619 LIST_INSERT_HEAD(&fs->lfs_segdhd, sd, list); 620 } 621 } 622 } else { 623 /* 624 * If it's not a dirop, we can finalize right away. 625 */ 626 mutex_exit(&lfs_lock); 627 lfs_finalize_ino_seguse(fs, ip); 628 } 629 630 /* it is no longer an unwritten inode, so update the counts */ 631 mutex_enter(&lfs_lock); 632 LFS_CLR_UINO(ip, IN_ACCESSED|IN_CLEANING|IN_MODIFIED); 633 mutex_exit(&lfs_lock); 634 635 /* Turn off all inode modification flags */ 636 ip->i_state &= ~IN_ALLMOD; 637 638 /* Mark it deleted */ 639 ip->i_lfs_iflags |= LFSI_DELETED; 640 641 /* Mark it free in the in-memory inode freemap */ 642 SET_BITMAP_FREE(fs, ino); 643 644 /* 645 * Set the ifile's inode entry to unused, increment its version number 646 * and link it onto the free chain. 647 */ 648 649 /* fetch the ifile entry */ 650 LFS_IENTRY(ifp, fs, ino, bp); 651 652 /* update the on-disk address (to "nowhere") */ 653 old_iaddr = lfs_if_getdaddr(fs, ifp); 654 lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR); 655 656 /* bump the version */ 657 lfs_if_setversion(fs, ifp, lfs_if_getversion(fs, ifp) + 1); 658 659 #if 0 660 if (lfs_sb_getversion(fs) == 1) { 661 #endif 662 ino_t nextfree; 663 664 /* insert on freelist */ 665 LFS_GET_HEADFREE(fs, cip, cbp, &nextfree); 666 lfs_if_setnextfree(fs, ifp, nextfree); 667 LFS_PUT_HEADFREE(fs, cip, cbp, ino); 668 669 /* write the ifile block */ 670 (void) LFS_BWRITE_LOG(bp); /* Ifile */ 671 #if 0 672 } else { 673 ino_t tino, onf, otail; 674 675 /* 676 * Clear the freelist next pointer and write the ifile 677 * block. XXX: why? I'm sure there must be a reason but 678 * it seems both silly and dangerous. 679 */ 680 lfs_if_setnextfree(fs, ifp, LFS_UNUSED_INUM); 681 (void) LFS_BWRITE_LOG(bp); /* Ifile */ 682 683 /* 684 * Insert on freelist in order. 685 */ 686 687 /* Find the next lower (by number) free inode */ 688 tino = lfs_freelist_prev(fs, ino); 689 690 if (tino == LFS_UNUSED_INUM) { 691 ino_t nextfree; 692 693 /* 694 * There isn't one; put us on the freelist head. 695 */ 696 697 /* reload the ifile block */ 698 LFS_IENTRY(ifp, fs, ino, bp); 699 /* update the list */ 700 LFS_GET_HEADFREE(fs, cip, cbp, &nextfree); 701 lfs_if_setnextfree(fs, ifp, nextfree); 702 LFS_PUT_HEADFREE(fs, cip, cbp, ino); 703 DLOG((DLOG_ALLOC, "lfs_vfree: headfree %lld -> %lld\n", 704 (long long)nextfree, (long long)ino)); 705 /* write the ifile block */ 706 LFS_BWRITE_LOG(bp); /* Ifile */ 707 708 /* If the list was empty, set tail too */ 709 LFS_GET_TAILFREE(fs, cip, cbp, &otail); 710 if (otail == LFS_UNUSED_INUM) { 711 LFS_PUT_TAILFREE(fs, cip, cbp, ino); 712 DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld " 713 "-> %lld\n", (long long)otail, 714 (long long)ino)); 715 } 716 } else { 717 /* 718 * Insert this inode into the list after tino. 719 * We hold the segment lock so we don't have to 720 * worry about blocks being written out of order. 721 */ 722 723 DLOG((DLOG_ALLOC, "lfs_vfree: insert ino %lld " 724 " after %lld\n", ino, tino)); 725 726 /* load the previous inode's ifile block */ 727 LFS_IENTRY(ifp, fs, tino, bp); 728 /* update the list pointer */ 729 onf = lfs_if_getnextfree(fs, ifp); 730 lfs_if_setnextfree(fs, ifp, ino); 731 /* write the block */ 732 LFS_BWRITE_LOG(bp); /* Ifile */ 733 734 /* load this inode's ifile block */ 735 LFS_IENTRY(ifp, fs, ino, bp); 736 /* update the list pointer */ 737 lfs_if_setnextfree(fs, ifp, onf); 738 /* write the block */ 739 LFS_BWRITE_LOG(bp); /* Ifile */ 740 741 /* If we're last, put us on the tail */ 742 if (onf == LFS_UNUSED_INUM) { 743 LFS_GET_TAILFREE(fs, cip, cbp, &otail); 744 LFS_PUT_TAILFREE(fs, cip, cbp, ino); 745 DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld " 746 "-> %lld\n", (long long)otail, 747 (long long)ino)); 748 } 749 } 750 } 751 #endif 752 /* XXX: shouldn't this check be further up *before* we trash the fs? */ 753 KASSERTMSG((ino != LFS_UNUSED_INUM), "inode 0 freed"); 754 755 /* 756 * Update the segment summary for the segment where the on-disk 757 * copy used to be. 758 */ 759 if (!DADDR_IS_BAD(old_iaddr)) { 760 /* load it */ 761 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, old_iaddr), bp); 762 /* the number of bytes in the segment should not become < 0 */ 763 KASSERTMSG((sup->su_nbytes >= DINOSIZE(fs)), 764 "lfs_vfree: negative byte count" 765 " (segment %" PRIu32 " short by %d)\n", 766 lfs_dtosn(fs, old_iaddr), 767 (int)DINOSIZE(fs) - sup->su_nbytes); 768 /* update the number of bytes in the segment */ 769 sup->su_nbytes -= DINOSIZE(fs); 770 /* write the segment entry */ 771 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, old_iaddr), bp); /* Ifile */ 772 } 773 774 /* Set superblock modified bit. */ 775 mutex_enter(&lfs_lock); 776 fs->lfs_fmod = 1; 777 mutex_exit(&lfs_lock); 778 779 /* Decrement file count. */ 780 lfs_sb_subnfiles(fs, 1); 781 782 lfs_segunlock(fs); 783 784 DEBUG_CHECK_FREELIST(fs); 785 786 return (0); 787 } 788 789 /* 790 * Sort the freelist and set up the free-inode bitmap. 791 * To be called by lfs_mountfs(). 792 * 793 * Takes the segmenet lock. 794 */ 795 void 796 lfs_order_freelist(struct lfs *fs, ino_t **orphanp, size_t *norphanp) 797 { 798 CLEANERINFO *cip; 799 IFILE *ifp = NULL; 800 struct buf *bp; 801 ino_t ino, firstino, lastino, maxino; 802 ino_t *orphan = NULL; 803 size_t norphan = 0; 804 size_t norphan_alloc = 0; 805 806 ASSERT_NO_SEGLOCK(fs); 807 lfs_seglock(fs, SEGM_PROT); 808 809 DEBUG_CHECK_FREELIST(fs); 810 811 /* largest inode on fs */ 812 maxino = ((fs->lfs_ivnode->v_size >> lfs_sb_getbshift(fs)) - 813 lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs); 814 815 /* allocate the in-memory inode freemap */ 816 /* XXX: assert that fs->lfs_ino_bitmap is null here */ 817 fs->lfs_ino_bitmap = 818 malloc(((maxino + BMMASK) >> BMSHIFT) * sizeof(lfs_bm_t), 819 M_SEGMENT, M_WAITOK | M_ZERO); 820 KASSERT(fs->lfs_ino_bitmap != NULL); 821 822 /* 823 * Scan the ifile. 824 */ 825 826 firstino = lastino = LFS_UNUSED_INUM; 827 for (ino = 0; ino < maxino; ino++) { 828 /* Load this inode's ifile entry. */ 829 if (ino % lfs_sb_getifpb(fs) == 0) 830 LFS_IENTRY(ifp, fs, ino, bp); 831 else 832 LFS_IENTRY_NEXT(ifp, fs); 833 834 /* Don't put zero or ifile on the free list */ 835 if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM) 836 continue; 837 838 /* 839 * Address orphaned files. 840 * 841 * The idea of this is to free inodes belonging to 842 * files that were unlinked but not reclaimed, I guess 843 * because if we're going to scan the whole ifile 844 * anyway it costs very little to do this. I don't 845 * immediately see any reason this should be disabled, 846 * but presumably it doesn't work... not sure what 847 * happens to such files currently. -- dholland 20160806 848 */ 849 if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE(fs)) { 850 if (orphan == NULL) { 851 norphan_alloc = 32; /* XXX pulled from arse */ 852 orphan = kmem_zalloc(sizeof(orphan[0]) * 853 norphan_alloc, KM_SLEEP); 854 } else if (norphan == norphan_alloc) { 855 ino_t *orphan_new; 856 if (norphan_alloc >= 4096) 857 norphan_alloc += 4096; 858 else 859 norphan_alloc *= 2; 860 orphan_new = kmem_zalloc(sizeof(orphan[0]) * 861 norphan_alloc, KM_SLEEP); 862 memcpy(orphan_new, orphan, sizeof(orphan[0]) * 863 norphan); 864 kmem_free(orphan, sizeof(orphan[0]) * norphan); 865 orphan = orphan_new; 866 } 867 orphan[norphan++] = ino; 868 } 869 870 if (DADDR_IS_BAD(lfs_if_getdaddr(fs, ifp))) { 871 872 /* 873 * This inode is free. Put it on the free list. 874 */ 875 876 if (firstino == LFS_UNUSED_INUM) { 877 /* XXX: assert lastino == LFS_UNUSED_INUM? */ 878 /* remember the first free inode */ 879 firstino = ino; 880 } else { 881 /* release this inode's ifile entry */ 882 brelse(bp, 0); 883 884 /* XXX: assert lastino != LFS_UNUSED_INUM? */ 885 886 /* load lastino's ifile entry */ 887 LFS_IENTRY(ifp, fs, lastino, bp); 888 /* set the list pointer */ 889 lfs_if_setnextfree(fs, ifp, ino); 890 /* write the block */ 891 LFS_BWRITE_LOG(bp); 892 893 /* reload this inode's ifile entry */ 894 LFS_IENTRY(ifp, fs, ino, bp); 895 } 896 /* remember the last free inode seen so far */ 897 lastino = ino; 898 899 /* Mark this inode free in the in-memory freemap */ 900 SET_BITMAP_FREE(fs, ino); 901 } 902 903 /* If moving to the next ifile block, release the buffer. */ 904 if ((ino + 1) % lfs_sb_getifpb(fs) == 0) 905 brelse(bp, 0); 906 } 907 908 /* Write the freelist head and tail pointers */ 909 /* XXX: do we need to mark the superblock dirty? */ 910 LFS_PUT_HEADFREE(fs, cip, bp, firstino); 911 LFS_PUT_TAILFREE(fs, cip, bp, lastino); 912 913 /* done */ 914 lfs_segunlock(fs); 915 916 /* 917 * Shrink the array of orphans so we don't have to carry around 918 * the allocation size. 919 */ 920 if (norphan < norphan_alloc) { 921 ino_t *orphan_new = kmem_alloc(sizeof(orphan[0]) * norphan, 922 KM_SLEEP); 923 memcpy(orphan_new, orphan, sizeof(orphan[0]) * norphan); 924 kmem_free(orphan, sizeof(orphan[0]) * norphan_alloc); 925 orphan = orphan_new; 926 norphan_alloc = norphan; 927 } 928 929 *orphanp = orphan; 930 *norphanp = norphan; 931 932 DEBUG_CHECK_FREELIST(fs); 933 } 934 935 /* 936 * Mark a file orphaned (unlinked but not yet reclaimed) by inode 937 * number. Do this with a magic freelist next pointer. 938 * 939 * XXX: howzabout some locking? 940 */ 941 void 942 lfs_orphan(struct lfs *fs, ino_t ino) 943 { 944 IFILE *ifp; 945 struct buf *bp; 946 947 LFS_IENTRY(ifp, fs, ino, bp); 948 lfs_if_setnextfree(fs, ifp, LFS_ORPHAN_NEXTFREE(fs)); 949 LFS_BWRITE_LOG(bp); 950 } 951 952 /* 953 * Free orphans discovered during mount. This is a separate stage 954 * because it requires fs->lfs_suflags to be set up, which is not done 955 * by the time we run lfs_order_freelist. It's possible that we could 956 * run lfs_order_freelist later (i.e., set up fs->lfs_suflags sooner) 957 * but that requires more thought than I can put into this at the 958 * moment. 959 */ 960 void 961 lfs_free_orphans(struct lfs *fs, ino_t *orphan, size_t norphan) 962 { 963 size_t i; 964 965 DEBUG_CHECK_FREELIST(fs); 966 967 for (i = 0; i < norphan; i++) { 968 ino_t ino = orphan[i]; 969 unsigned segno; 970 struct vnode *vp; 971 struct inode *ip; 972 struct buf *bp; 973 IFILE *ifp; 974 SEGUSE *sup; 975 int error; 976 977 /* Get the segment the inode is in on disk. */ 978 LFS_IENTRY(ifp, fs, ino, bp); 979 KASSERT(!DADDR_IS_BAD(lfs_if_getdaddr(fs, ifp))); 980 segno = lfs_dtosn(fs, lfs_if_getdaddr(fs, ifp)); 981 brelse(bp, 0); 982 983 /* 984 * Try to get the vnode. If we can't, tough -- hope 985 * you have backups! 986 */ 987 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, 988 &vp); 989 if (error) { 990 printf("orphan %jd vget error %d\n", (intmax_t)ino, 991 error); 992 continue; 993 } 994 995 /* 996 * Sanity-check the inode. 997 * 998 * XXX What to do if it is still referenced? 999 */ 1000 ip = VTOI(vp); 1001 if (ip->i_nlink != 0) 1002 printf("orphan %jd nlink %d\n", (intmax_t)ino, 1003 ip->i_nlink); 1004 1005 /* 1006 * Truncate the inode, to free any blocks allocated for 1007 * it, and release it, to free the inode number. 1008 * 1009 * XXX Isn't it redundant to truncate? Won't vput do 1010 * that for us? 1011 */ 1012 error = lfs_truncate(vp, 0, 0, NOCRED); 1013 if (error) 1014 printf("orphan %jd truncate error %d", (intmax_t)ino, 1015 error); 1016 vput(vp); 1017 1018 /* Update the number of bytes in the segment summary. */ 1019 LFS_SEGENTRY(sup, fs, segno, bp); 1020 KASSERT(sup->su_nbytes >= DINOSIZE(fs)); 1021 sup->su_nbytes -= DINOSIZE(fs); 1022 LFS_WRITESEGENTRY(sup, fs, segno, bp); 1023 1024 /* Drop the on-disk address. */ 1025 LFS_IENTRY(ifp, fs, ino, bp); 1026 lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR); 1027 LFS_BWRITE_LOG(bp); 1028 } 1029 1030 if (orphan) 1031 kmem_free(orphan, sizeof(orphan[0]) * norphan); 1032 1033 DEBUG_CHECK_FREELIST(fs); 1034 } 1035 1036 #ifdef DEBUG 1037 static void dump_freelist(struct lfs *); 1038 1039 void 1040 lfs_check_freelist(struct lfs *fs, const char *func, int line) 1041 { 1042 ino_t i, headino, maxino, thisino, tailino, nextfree; 1043 int nfree, count; 1044 struct inode *ip; 1045 IFILE *ifp; 1046 CLEANERINFO *cip; 1047 struct buf *bp; 1048 1049 if (!lfs_do_check_freelist) 1050 return; 1051 1052 lfs_seglock(fs, SEGM_PROT); 1053 1054 ip = VTOI(fs->lfs_ivnode); 1055 maxino = ((ip->i_size >> lfs_sb_getbshift(fs)) - lfs_sb_getcleansz(fs) - 1056 lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs); 1057 1058 /* 1059 * First, check every node. Every inode that doesn't have a disk 1060 * address must have a nextfree pointer, the only exception 1061 * being the tail of the free list. 1062 */ 1063 LFS_GET_TAILFREE(fs, cip, bp, &tailino); 1064 nfree = 0; 1065 for (i = LFS_IFILE_INUM + 1; i < maxino; ++i) { 1066 LFS_IENTRY(ifp, fs, i, bp); 1067 if (lfs_if_getdaddr(fs, ifp) == LFS_UNUSED_DADDR) { 1068 ++nfree; 1069 if (lfs_if_getnextfree(fs, ifp) == LFS_UNUSED_INUM 1070 && i != tailino) { 1071 brelse(bp, 0); 1072 dump_freelist(fs); 1073 printf("At %s:%d:\n", func, line); 1074 printf("tailino=%jd, but ino=%jd" 1075 " neither daddr nor nextfree\n", 1076 (intmax_t)tailino, (intmax_t)i); 1077 panic("Free list leak\n"); 1078 } 1079 } 1080 if (i == tailino 1081 || (!INUM_IS_BAD(fs, lfs_if_getnextfree(fs, ifp)))) { 1082 if (lfs_if_getdaddr(fs, ifp) != LFS_UNUSED_DADDR) { 1083 brelse(bp, 0); 1084 dump_freelist(fs); 1085 printf("At %s:%d:\n", func, line); 1086 printf("with tailino=%jd, ino=%jd" 1087 " daddr=0x%jx, nextfree=0x%jx\n", 1088 (intmax_t)tailino, 1089 (intmax_t)i, 1090 (intmax_t)lfs_if_getdaddr(fs, ifp), 1091 (intmax_t)lfs_if_getnextfree(fs, ifp)); 1092 panic("In use inode on free list\n"); 1093 } 1094 } 1095 brelse(bp, 0); 1096 } 1097 1098 /* 1099 * Walk the free list from head to tail. We should end up with 1100 * the same number of free inodes as we counted above. 1101 */ 1102 1103 /* Get head of inode freelist */ 1104 LFS_GET_HEADFREE(fs, cip, bp, &headino); 1105 count = 0; 1106 thisino = headino; 1107 while (thisino != LFS_UNUSED_INUM) { 1108 if (++count > maxino) 1109 break; 1110 /* read this ifile entry */ 1111 LFS_IENTRY(ifp, fs, thisino, bp); 1112 nextfree = lfs_if_getnextfree(fs, ifp); 1113 brelse(bp, 0); 1114 if (nextfree == LFS_UNUSED_INUM) 1115 break; 1116 thisino = nextfree; 1117 } 1118 if (count > maxino) { 1119 dump_freelist(fs); 1120 printf("At %s:%d:\n", func, line); 1121 printf("count=%jd, maxino=%jd:\n", 1122 (intmax_t)count, (intmax_t)maxino); 1123 panic("loop in free list"); 1124 } 1125 if (count != nfree) { 1126 dump_freelist(fs); 1127 printf("At %s:%d:\n", func, line); 1128 printf("%d inodes without addresses, %d on free list\n", 1129 nfree, count); 1130 panic("Bad free list count"); 1131 } 1132 if (thisino != tailino) { 1133 dump_freelist(fs); 1134 printf("At %s:%d:\n", func, line); 1135 printf("Last ino %jd but tail %jd\n", 1136 (intmax_t)thisino, (intmax_t)tailino); 1137 panic("Bad tail"); 1138 } 1139 lfs_segunlock(fs); 1140 } 1141 1142 static void 1143 dump_freelist(struct lfs *fs) 1144 { 1145 ino_t i, ni, maxino, headino, tailino; 1146 struct inode *ip; 1147 IFILE *ifp; 1148 CLEANERINFO *cip; 1149 struct buf *bp; 1150 int count; 1151 1152 ip = VTOI(fs->lfs_ivnode); 1153 maxino = ((ip->i_size >> lfs_sb_getbshift(fs)) - lfs_sb_getcleansz(fs) - 1154 lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs); 1155 1156 LFS_GET_HEADFREE(fs, cip, bp, &headino); 1157 printf(" head: %jd\n", (intmax_t)headino); 1158 LFS_GET_TAILFREE(fs, cip, bp, &tailino); 1159 printf(" tail: %jd\n", (intmax_t)tailino); 1160 count = 0; 1161 for (i = LFS_IFILE_INUM + 1; i < maxino; ++i) { 1162 LFS_IENTRY(ifp, fs, i, bp); 1163 ni = lfs_if_getnextfree(fs, ifp); 1164 if (ni != LFS_UNUSED_DADDR) { 1165 printf("%jd -> %jd\n", 1166 (intmax_t)i, (intmax_t)ni); 1167 if (++count > 30) { 1168 printf("...\n"); 1169 i = maxino; /* terminate loop */ 1170 } 1171 } 1172 brelse(bp, 0); 1173 } 1174 } 1175 #endif /* DEBUG */ 1176