Home | History | Annotate | Line # | Download | only in lfs
lfs_alloc.c revision 1.143
      1 /*	$NetBSD: lfs_alloc.c,v 1.143 2025/09/15 18:49:16 perseant Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 /*
     32  * Copyright (c) 1991, 1993
     33  *	The Regents of the University of California.  All rights reserved.
     34  *
     35  * Redistribution and use in source and binary forms, with or without
     36  * modification, are permitted provided that the following conditions
     37  * are met:
     38  * 1. Redistributions of source code must retain the above copyright
     39  *    notice, this list of conditions and the following disclaimer.
     40  * 2. Redistributions in binary form must reproduce the above copyright
     41  *    notice, this list of conditions and the following disclaimer in the
     42  *    documentation and/or other materials provided with the distribution.
     43  * 3. Neither the name of the University nor the names of its contributors
     44  *    may be used to endorse or promote products derived from this software
     45  *    without specific prior written permission.
     46  *
     47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     57  * SUCH DAMAGE.
     58  *
     59  *	@(#)lfs_alloc.c	8.4 (Berkeley) 1/4/94
     60  */
     61 
     62 #include <sys/cdefs.h>
     63 __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.143 2025/09/15 18:49:16 perseant Exp $");
     64 
     65 #if defined(_KERNEL_OPT)
     66 #include "opt_quota.h"
     67 #endif
     68 
     69 #include <sys/param.h>
     70 #include <sys/systm.h>
     71 #include <sys/kernel.h>
     72 #include <sys/buf.h>
     73 #include <sys/lock.h>
     74 #include <sys/vnode.h>
     75 #include <sys/syslog.h>
     76 #include <sys/mount.h>
     77 #include <sys/malloc.h>
     78 #include <sys/pool.h>
     79 #include <sys/proc.h>
     80 #include <sys/kauth.h>
     81 
     82 #include <ufs/lfs/ulfs_quotacommon.h>
     83 #include <ufs/lfs/ulfs_inode.h>
     84 #include <ufs/lfs/ulfsmount.h>
     85 #include <ufs/lfs/ulfs_extern.h>
     86 
     87 #include <ufs/lfs/lfs.h>
     88 #include <ufs/lfs/lfs_accessors.h>
     89 #include <ufs/lfs/lfs_extern.h>
     90 #include <ufs/lfs/lfs_kernel.h>
     91 
     92 /* Constants for inode free bitmap */
     93 #define BMSHIFT 5	/* 2 ** 5 = 32 */
     94 #define BMMASK  ((1 << BMSHIFT) - 1)
     95 #define SET_BITMAP_FREE(F, I) do { \
     96 	DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d set\n", (int)(I), 	\
     97 	     (int)((I) >> BMSHIFT), (int)((I) & BMMASK)));		\
     98 	(F)->lfs_ino_bitmap[(I) >> BMSHIFT] |= (1U << ((I) & BMMASK));	\
     99 } while (0)
    100 #define CLR_BITMAP_FREE(F, I) do { \
    101 	DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d clr\n", (int)(I), 	\
    102 	     (int)((I) >> BMSHIFT), (int)((I) & BMMASK)));		\
    103 	(F)->lfs_ino_bitmap[(I) >> BMSHIFT] &= ~(1U << ((I) & BMMASK));	\
    104 } while(0)
    105 
    106 #define ISSET_BITMAP_FREE(F, I) \
    107 	((F)->lfs_ino_bitmap[(I) >> BMSHIFT] & (1U << ((I) & BMMASK)))
    108 
    109 /*
    110  * Add a new block to the Ifile, to accommodate future file creations.
    111  * Called with the segment lock held.
    112  */
    113 int
    114 lfs_extend_ifile(struct lfs *fs, kauth_cred_t cred)
    115 {
    116 	struct vnode *vp;
    117 	struct inode *ip;
    118 	IFILE64 *ifp64;
    119 	IFILE32 *ifp32;
    120 	IFILE_V1 *ifp_v1;
    121 	struct buf *bp, *cbp;
    122 	int error;
    123 	daddr_t i, blkno, xmax;
    124 	ino_t oldhead, maxino, tail;
    125 	CLEANERINFO *cip;
    126 
    127 	ASSERT_SEGLOCK(fs);
    128 
    129 	/* XXX should check or assert that we aren't readonly. */
    130 
    131 	/*
    132 	 * Get a block and extend the ifile inode. Leave the buffer for
    133 	 * the block in bp.
    134 	 */
    135 
    136 	vp = fs->lfs_ivnode;
    137 	ip = VTOI(vp);
    138 	blkno = lfs_lblkno(fs, ip->i_size);
    139 	if ((error = lfs_balloc(vp, ip->i_size, lfs_sb_getbsize(fs), cred, 0,
    140 				&bp)) != 0) {
    141 		return (error);
    142 	}
    143 	ip->i_size += lfs_sb_getbsize(fs);
    144 	lfs_dino_setsize(fs, ip->i_din, ip->i_size);
    145 	uvm_vnp_setsize(vp, ip->i_size);
    146 
    147 	/*
    148 	 * Compute the new number of inodes, and reallocate the in-memory
    149 	 * inode freemap.
    150 	 */
    151 
    152 	maxino = ((ip->i_size >> lfs_sb_getbshift(fs)) - lfs_sb_getcleansz(fs) -
    153 		  lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs);
    154 	fs->lfs_ino_bitmap = (lfs_bm_t *)
    155 		realloc(fs->lfs_ino_bitmap, ((maxino + BMMASK) >> BMSHIFT) *
    156 			sizeof(lfs_bm_t), M_SEGMENT, M_WAITOK);
    157 	KASSERT(fs->lfs_ino_bitmap != NULL);
    158 
    159 	/* first new inode number */
    160 	i = (blkno - lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs)) *
    161 		lfs_sb_getifpb(fs);
    162 
    163 	/* inode number to stop at (XXX: why *x*max?) */
    164 	xmax = i + lfs_sb_getifpb(fs);
    165 
    166 	/*
    167 	 * We insert the new inodes at the head of the free list.
    168 	 * Under normal circumstances, the free list is empty here,
    169 	 * so we are also incidentally placing them at the end (which
    170 	 * we must do if we are to keep them in order).
    171 	 */
    172 	LFS_GET_HEADFREE(fs, cip, cbp, &oldhead);
    173 	LFS_PUT_HEADFREE(fs, cip, cbp, i);
    174 	LFS_GET_TAILFREE(fs, cip, cbp, &tail);
    175 	if (tail == LFS_UNUSED_INUM) {
    176 		tail = xmax - 1;
    177 		LFS_PUT_TAILFREE(fs, cip, cbp, tail);
    178 	}
    179 	KASSERTMSG((lfs_sb_getfreehd(fs) != LFS_UNUSED_INUM),
    180 	    "inode 0 allocated [2]");
    181 
    182 
    183 	/*
    184 	 * Initialize the ifile block.
    185 	 *
    186 	 * XXX: these loops should be restructured to use the accessor
    187 	 * functions instead of using cutpaste polymorphism.
    188 	 */
    189 
    190 	if (fs->lfs_is64) {
    191 		for (ifp64 = (IFILE64 *)bp->b_data; i < xmax; ++ifp64) {
    192 			SET_BITMAP_FREE(fs, i);
    193 			ifp64->if_version = 1;
    194 			ifp64->if_daddr = LFS_UNUSED_DADDR;
    195 			ifp64->if_nextfree = ++i;
    196 		}
    197 		ifp64--;
    198 		ifp64->if_nextfree = oldhead;
    199 	} else if (lfs_sb_getversion(fs) > 1) {
    200 		for (ifp32 = (IFILE32 *)bp->b_data; i < xmax; ++ifp32) {
    201 			SET_BITMAP_FREE(fs, i);
    202 			ifp32->if_version = 1;
    203 			ifp32->if_daddr = LFS_UNUSED_DADDR;
    204 			ifp32->if_nextfree = ++i;
    205 		}
    206 		ifp32--;
    207 		ifp32->if_nextfree = oldhead;
    208 	} else {
    209 		for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < xmax; ++ifp_v1) {
    210 			SET_BITMAP_FREE(fs, i);
    211 			ifp_v1->if_version = 1;
    212 			ifp_v1->if_daddr = LFS_UNUSED_DADDR;
    213 			ifp_v1->if_nextfree = ++i;
    214 		}
    215 		ifp_v1--;
    216 		ifp_v1->if_nextfree = oldhead;
    217 	}
    218 
    219 	/*
    220 	 * Write out the new block.
    221 	 */
    222 
    223 	(void) LFS_BWRITE_LOG(bp); /* Ifile */
    224 
    225 	return 0;
    226 }
    227 
    228 /*
    229  * Allocate an inode for a new file.
    230  *
    231  * Takes the segment lock. Also (while holding it) takes lfs_lock
    232  * to frob fs->lfs_fmod.
    233  *
    234  * XXX: the mode argument is unused; should just get rid of it.
    235  */
    236 /* ARGSUSED */
    237 /* VOP_BWRITE 2i times */
    238 int
    239 lfs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
    240     ino_t *ino, int *gen)
    241 {
    242 	struct lfs *fs;
    243 	struct buf *bp, *cbp;
    244 	IFILE *ifp;
    245 	int error;
    246 	CLEANERINFO *cip;
    247 
    248 	fs = VTOI(pvp)->i_lfs;
    249 	if (fs->lfs_ronly)
    250 		return EROFS;
    251 
    252 	if (!(fs->lfs_flags & LFS_NOTYET))
    253 		ASSERT_NO_SEGLOCK(fs);
    254 
    255 	lfs_seglock(fs, SEGM_PROT);
    256 
    257 	/* Get the head of the freelist. */
    258 	LFS_GET_HEADFREE(fs, cip, cbp, ino);
    259 
    260 	/* paranoia */
    261 	KASSERT(*ino != LFS_UNUSED_INUM && *ino != LFS_IFILE_INUM);
    262 	DLOG((DLOG_ALLOC, "lfs_valloc: allocate inode %" PRId64 "\n",
    263 	     *ino));
    264 
    265 	/* Update the in-memory inode freemap */
    266 	CLR_BITMAP_FREE(fs, *ino);
    267 
    268 	/*
    269 	 * Fetch the ifile entry and make sure the inode is really
    270 	 * free.
    271 	 */
    272 	LFS_IENTRY(ifp, fs, *ino, bp);
    273 	if (lfs_if_getdaddr(fs, ifp) != LFS_UNUSED_DADDR)
    274 		panic("lfs_valloc: inuse inode %" PRId64 " on the free list",
    275 		    *ino);
    276 
    277 	/* Update the inode freelist head in the superblock. */
    278 	LFS_PUT_HEADFREE(fs, cip, cbp, lfs_if_getnextfree(fs, ifp));
    279 	DLOG((DLOG_ALLOC, "lfs_valloc: headfree %" PRId64 " -> %ju\n",
    280 	     *ino, (uintmax_t)lfs_if_getnextfree(fs, ifp)));
    281 
    282 	/*
    283 	 * Retrieve the version number from the ifile entry. It was
    284 	 * bumped by vfree, so don't bump it again.
    285 	 */
    286 	*gen = lfs_if_getversion(fs, ifp);
    287 
    288 	/* Done with ifile entry */
    289 	lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR);
    290 	lfs_if_setnextfree(fs, ifp, LFS_UNUSED_INUM);
    291 	LFS_BWRITE_LOG(bp);
    292 
    293 	if (lfs_sb_getfreehd(fs) == LFS_UNUSED_INUM) {
    294 		/*
    295 		 * No more inodes; extend the ifile so that the next
    296 		 * lfs_valloc will succeed.
    297 		 */
    298 		if ((error = lfs_extend_ifile(fs, cred)) != 0) {
    299 			/* restore the freelist */
    300 			LFS_PUT_HEADFREE(fs, cip, cbp, *ino);
    301 
    302 			/* unlock and return */
    303 			lfs_segunlock(fs);
    304 			return error;
    305 		}
    306 	}
    307 	KASSERTMSG((lfs_sb_getfreehd(fs) != LFS_UNUSED_INUM),
    308 	    "inode 0 allocated [3]");
    309 
    310 	/* Set superblock modified bit */
    311 	mutex_enter(&lfs_lock);
    312 	fs->lfs_fmod = 1;
    313 	mutex_exit(&lfs_lock);
    314 
    315 	/* increment file count */
    316 	lfs_sb_addnfiles(fs, 1);
    317 
    318 	/* done */
    319 	lfs_segunlock(fs);
    320 	return 0;
    321 }
    322 
    323 /*
    324  * Allocate an inode for a new file, with given inode number and
    325  * version.
    326  *
    327  * Called in the same context as lfs_valloc and therefore shares the
    328  * same locking assumptions.
    329  */
    330 int
    331 lfs_valloc_fixed(struct lfs *fs, ino_t ino, int vers)
    332 {
    333 	IFILE *ifp;
    334 	struct buf *bp, *cbp;
    335 	ino_t headino, thisino, oldnext, tailino;
    336 	CLEANERINFO *cip;
    337 	int extended = 0;
    338 
    339 	if (fs->lfs_ronly)
    340 		return EROFS;
    341 
    342 	if (!(fs->lfs_flags & LFS_NOTYET))
    343 		ASSERT_NO_SEGLOCK(fs);
    344 
    345 
    346 	lfs_seglock(fs, SEGM_PROT);
    347 
    348 	/*
    349 	 * If the ifile is too short to contain this inum, extend it.
    350 	 *
    351 	 * XXX: lfs_extend_ifile should take a size instead of always
    352 	 * doing just one block at time.
    353 	 */
    354 	while (VTOI(fs->lfs_ivnode)->i_size <= (ino /
    355 		lfs_sb_getifpb(fs) + lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs))
    356 		<< lfs_sb_getbshift(fs)) {
    357 		DLOG((DLOG_ALLOC, "extend ifile to accommodate ino %jd\n",
    358 		      (intmax_t)ino));
    359 		lfs_extend_ifile(fs, NOCRED);
    360 		extended = 1;
    361 	}
    362 
    363 	/*
    364 	 * Get the inode freelist next pointer.
    365 	 */
    366 	LFS_IENTRY(ifp, fs, ino, bp);
    367 	oldnext = lfs_if_getnextfree(fs, ifp);
    368 	brelse(bp, 0);
    369 
    370 	/* Get fail of inode freelist */
    371 	LFS_GET_TAILFREE(fs, cip, cbp, &tailino);
    372 
    373 	/* Get head of inode freelist */
    374 	LFS_GET_HEADFREE(fs, cip, cbp, &headino);
    375 	if (headino == ino) {
    376 		/* Easy case: the inode we wanted was at the head */
    377 		LFS_PUT_HEADFREE(fs, cip, cbp, oldnext);
    378 	} else {
    379 		ino_t nextfree = 0, maxino, count; /* XXX: gcc */
    380 
    381 		/* Have to find the desired inode in the freelist... */
    382 		maxino = ((VTOI(fs->lfs_ivnode)->i_size >> lfs_sb_getbshift(fs))
    383 			  - lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs))
    384 			* lfs_sb_getifpb(fs);
    385 		count = 0;
    386 
    387 		thisino = headino;
    388 		while (thisino != LFS_UNUSED_INUM) {
    389 			/* read this ifile entry */
    390 			LFS_IENTRY(ifp, fs, thisino, bp);
    391 			nextfree = lfs_if_getnextfree(fs, ifp);
    392 			/* stop if we find it or we hit the end */
    393 			if (nextfree == ino ||
    394 			    nextfree == LFS_UNUSED_INUM)
    395 				break;
    396 			/* nope, keep going... */
    397 			thisino = nextfree;
    398 			brelse(bp, 0);
    399 			if (++count > maxino)
    400 				break;
    401 		}
    402 		if (count > maxino) {
    403 			panic("loop in free list");
    404 			lfs_segunlock(fs);
    405 			return ENOENT;
    406 		}
    407 		if (nextfree == LFS_UNUSED_INUM) {
    408 			/* hit the end -- this inode is not available */
    409 			brelse(bp, 0);
    410 			lfs_segunlock(fs);
    411 			if (extended)
    412 				panic("extended ifile to accommodate but inode not found");
    413 			return ENOENT;
    414 		}
    415 		/* found it; update the next pointer */
    416 		lfs_if_setnextfree(fs, ifp, oldnext);
    417 		/* write the ifile block */
    418 		LFS_BWRITE_LOG(bp);
    419 
    420 		/* If our inode was the tail, thisino is now the tail */
    421 		if (tailino == ino)
    422 			LFS_PUT_TAILFREE(fs, cip, cbp, thisino);
    423 	}
    424 
    425 	/* Clear nextfree, note daddr, and set generation number */
    426 	LFS_IENTRY(ifp, fs, ino, bp);
    427 	lfs_if_setversion(fs, ifp, vers);
    428 	lfs_if_setnextfree(fs, ifp, LFS_UNUSED_INUM);
    429 	LFS_BWRITE_LOG(bp);
    430 
    431 	if (lfs_sb_getfreehd(fs) == LFS_UNUSED_INUM) {
    432 		int error;
    433 		/*
    434 		 * No more inodes; extend the ifile so that the next
    435 		 * lfs_valloc will succeed.
    436 		 */
    437 		if ((error = lfs_extend_ifile(fs, NOCRED)) != 0) {
    438 			/* restore the freelist */
    439 			LFS_PUT_HEADFREE(fs, cip, cbp, ino);
    440 
    441 			/* unlock and return */
    442 			lfs_segunlock(fs);
    443 			return error;
    444 		}
    445 	}
    446 	KASSERTMSG((lfs_sb_getfreehd(fs) != LFS_UNUSED_INUM),
    447 	    "inode 0 allocated [4]");
    448 
    449 	/* done */
    450 	lfs_segunlock(fs);
    451 	return 0;
    452 }
    453 
    454 #if 0
    455 /*
    456  * Find the highest-numbered allocated inode.
    457  * This will be used to shrink the Ifile.
    458  */
    459 static inline ino_t
    460 lfs_last_alloc_ino(struct lfs *fs)
    461 {
    462 	ino_t ino, maxino;
    463 
    464 	maxino = ((fs->lfs_ivnode->v_size >> lfs_sb_getbshift(fs)) -
    465 		  lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) *
    466 		lfs_sb_getifpb(fs);
    467 	for (ino = maxino - 1; ino > LFS_UNUSED_INUM; --ino) {
    468 		if (ISSET_BITMAP_FREE(fs, ino) == 0)
    469 			break;
    470 	}
    471 	return ino;
    472 }
    473 #endif
    474 
    475 /*
    476  * Find the previous (next lowest numbered) free inode, if any.
    477  * If there is none, return LFS_UNUSED_INUM.
    478  *
    479  * XXX: locking?
    480  */
    481 static inline ino_t
    482 lfs_freelist_prev(struct lfs *fs, ino_t ino)
    483 {
    484 	ino_t tino, bound, bb, freehdbb;
    485 
    486 	if (lfs_sb_getfreehd(fs) == LFS_UNUSED_INUM) {
    487 		/* No free inodes at all */
    488 		return LFS_UNUSED_INUM;
    489 	}
    490 
    491 	/* Search our own word first */
    492 	bound = ino & ~BMMASK;
    493 	for (tino = ino - 1; tino >= bound && tino > LFS_UNUSED_INUM; tino--)
    494 		if (ISSET_BITMAP_FREE(fs, tino))
    495 			return tino;
    496 	/* If there are no lower words to search, just return */
    497 	if (ino >> BMSHIFT == 0)
    498 		return LFS_UNUSED_INUM;
    499 
    500 	/*
    501 	 * Find a word with a free inode in it.  We have to be a bit
    502 	 * careful here since ino_t is unsigned.
    503 	 */
    504 	freehdbb = (lfs_sb_getfreehd(fs) >> BMSHIFT);
    505 	for (bb = (ino >> BMSHIFT) - 1; bb >= freehdbb && bb > 0; --bb)
    506 		if (fs->lfs_ino_bitmap[bb])
    507 			break;
    508 	if (fs->lfs_ino_bitmap[bb] == 0)
    509 		return LFS_UNUSED_INUM;
    510 
    511 	/* Search the word we found */
    512 	for (tino = (bb << BMSHIFT) | BMMASK; tino >= (bb << BMSHIFT) &&
    513 	     tino > LFS_UNUSED_INUM; tino--)
    514 		if (ISSET_BITMAP_FREE(fs, tino))
    515 			break;
    516 
    517 	/* Avoid returning reserved inode numbers */
    518 	if (tino <= LFS_IFILE_INUM)
    519 		tino = LFS_UNUSED_INUM;
    520 
    521 	return tino;
    522 }
    523 
    524 /*
    525  * Free an inode.
    526  *
    527  * Takes lfs_seglock. Also (independently) takes vp->v_interlock.
    528  */
    529 /* ARGUSED */
    530 /* VOP_BWRITE 2i times */
    531 int
    532 lfs_vfree(struct vnode *vp, ino_t ino, int mode)
    533 {
    534 	SEGUSE *sup;
    535 	CLEANERINFO *cip;
    536 	struct buf *cbp, *bp;
    537 	IFILE *ifp;
    538 	struct inode *ip;
    539 	struct lfs *fs;
    540 	daddr_t old_iaddr;
    541 	ino_t otail;
    542 
    543 	/* Get the inode number and file system. */
    544 	ip = VTOI(vp);
    545 	fs = ip->i_lfs;
    546 	ino = ip->i_number;
    547 
    548 	/* XXX: assert not readonly */
    549 
    550 	ASSERT_NO_SEGLOCK(fs);
    551 	DLOG((DLOG_ALLOC, "lfs_vfree: free ino %lld\n", (long long)ino));
    552 
    553 	/* Drain of pending writes */
    554 	mutex_enter(vp->v_interlock);
    555 	while (lfs_sb_getversion(fs) > 1 && WRITEINPROG(vp)) {
    556 		cv_wait(&vp->v_cv, vp->v_interlock);
    557 	}
    558 	mutex_exit(vp->v_interlock);
    559 
    560 	lfs_seglock(fs, SEGM_PROT);
    561 
    562 	/*
    563 	 * If the inode was in a dirop, it isn't now.
    564 	 *
    565 	 * XXX: why are (v_uflag & VU_DIROP) and (ip->i_state & IN_ADIROP)
    566 	 * not updated together in one function? (and why do both exist,
    567 	 * anyway?)
    568 	 */
    569 	UNMARK_VNODE(vp);
    570 
    571 	mutex_enter(&lfs_lock);
    572 	if (vp->v_uflag & VU_DIROP) {
    573 		vp->v_uflag &= ~VU_DIROP;
    574 		--lfs_dirvcount;
    575 		--fs->lfs_dirvcount;
    576 		TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
    577 		wakeup(&fs->lfs_dirvcount);
    578 		wakeup(&lfs_dirvcount);
    579 		mutex_exit(&lfs_lock);
    580 		vrele(vp);
    581 
    582 		/*
    583 		 * If this inode is not going to be written any more, any
    584 		 * segment accounting left over from its truncation needs
    585 		 * to occur at the end of the next dirops flush.  Attach
    586 		 * them to the fs-wide list for that purpose.
    587 		 */
    588 		if (LIST_FIRST(&ip->i_lfs_segdhd) != NULL) {
    589 			struct segdelta *sd;
    590 
    591 			while((sd = LIST_FIRST(&ip->i_lfs_segdhd)) != NULL) {
    592 				LIST_REMOVE(sd, list);
    593 				LIST_INSERT_HEAD(&fs->lfs_segdhd, sd, list);
    594 			}
    595 		}
    596 	} else {
    597 		/*
    598 		 * If it's not a dirop, we can finalize right away.
    599 		 */
    600 		mutex_exit(&lfs_lock);
    601 		lfs_finalize_ino_seguse(fs, ip);
    602 	}
    603 
    604 	/* it is no longer an unwritten inode, so update the counts */
    605 	mutex_enter(&lfs_lock);
    606 	LFS_CLR_UINO(ip, IN_ACCESSED|IN_CLEANING|IN_MODIFIED);
    607 	mutex_exit(&lfs_lock);
    608 
    609 	/* Turn off all inode modification flags */
    610 	ip->i_state &= ~IN_ALLMOD;
    611 
    612 	/* Mark it deleted */
    613 	ip->i_lfs_iflags |= LFSI_DELETED;
    614 
    615 	/* Mark it free in the in-memory inode freemap */
    616 	SET_BITMAP_FREE(fs, ino);
    617 
    618 	/*
    619 	 * Set the ifile's inode entry to unused, increment its version number
    620 	 * and link it onto the free chain.
    621 	 */
    622 
    623 	/* fetch the ifile entry */
    624 	LFS_IENTRY(ifp, fs, ino, bp);
    625 
    626 	/* update the on-disk address (to "nowhere") */
    627 	old_iaddr = lfs_if_getdaddr(fs, ifp);
    628 	lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR);
    629 
    630 	/* bump the version */
    631 	lfs_if_setversion(fs, ifp, lfs_if_getversion(fs, ifp) + 1);
    632 
    633 	if (lfs_sb_getversion(fs) == 1) {
    634 		ino_t nextfree;
    635 
    636 		/* insert on freelist */
    637 		LFS_GET_HEADFREE(fs, cip, cbp, &nextfree);
    638 		lfs_if_setnextfree(fs, ifp, nextfree);
    639 		LFS_PUT_HEADFREE(fs, cip, cbp, ino);
    640 
    641 		/* write the ifile block */
    642 		(void) LFS_BWRITE_LOG(bp); /* Ifile */
    643 	} else {
    644 		ino_t tino, onf;
    645 
    646 		/*
    647 		 * Clear the freelist next pointer and write the ifile
    648 		 * block. XXX: why? I'm sure there must be a reason but
    649 		 * it seems both silly and dangerous.
    650 		 */
    651 		lfs_if_setnextfree(fs, ifp, LFS_UNUSED_INUM);
    652 		(void) LFS_BWRITE_LOG(bp); /* Ifile */
    653 
    654 		/*
    655 		 * Insert on freelist in order.
    656 		 */
    657 
    658 		/* Find the next lower (by number) free inode */
    659 		tino = lfs_freelist_prev(fs, ino);
    660 
    661 		if (tino == LFS_UNUSED_INUM) {
    662 			ino_t nextfree;
    663 
    664 			/*
    665 			 * There isn't one; put us on the freelist head.
    666 			 */
    667 
    668 			/* reload the ifile block */
    669 			LFS_IENTRY(ifp, fs, ino, bp);
    670 			/* update the list */
    671 			LFS_GET_HEADFREE(fs, cip, cbp, &nextfree);
    672 			lfs_if_setnextfree(fs, ifp, nextfree);
    673 			LFS_PUT_HEADFREE(fs, cip, cbp, ino);
    674 			DLOG((DLOG_ALLOC, "lfs_vfree: headfree %lld -> %lld\n",
    675 			     (long long)nextfree, (long long)ino));
    676 			/* write the ifile block */
    677 			LFS_BWRITE_LOG(bp); /* Ifile */
    678 
    679 			/* If the list was empty, set tail too */
    680 			LFS_GET_TAILFREE(fs, cip, cbp, &otail);
    681 			if (otail == LFS_UNUSED_INUM) {
    682 				LFS_PUT_TAILFREE(fs, cip, cbp, ino);
    683 				DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld "
    684 				      "-> %lld\n", (long long)otail,
    685 				      (long long)ino));
    686 			}
    687 		} else {
    688 			/*
    689 			 * Insert this inode into the list after tino.
    690 			 * We hold the segment lock so we don't have to
    691 			 * worry about blocks being written out of order.
    692 			 */
    693 
    694 			DLOG((DLOG_ALLOC, "lfs_vfree: insert ino %lld "
    695 			      " after %lld\n", ino, tino));
    696 
    697 			/* load the previous inode's ifile block */
    698 			LFS_IENTRY(ifp, fs, tino, bp);
    699 			/* update the list pointer */
    700 			onf = lfs_if_getnextfree(fs, ifp);
    701 			lfs_if_setnextfree(fs, ifp, ino);
    702 			/* write the block */
    703 			LFS_BWRITE_LOG(bp);	/* Ifile */
    704 
    705 			/* load this inode's ifile block */
    706 			LFS_IENTRY(ifp, fs, ino, bp);
    707 			/* update the list pointer */
    708 			lfs_if_setnextfree(fs, ifp, onf);
    709 			/* write the block */
    710 			LFS_BWRITE_LOG(bp);	/* Ifile */
    711 
    712 			/* If we're last, put us on the tail */
    713 			if (onf == LFS_UNUSED_INUM) {
    714 				LFS_GET_TAILFREE(fs, cip, cbp, &otail);
    715 				LFS_PUT_TAILFREE(fs, cip, cbp, ino);
    716 				DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld "
    717 				      "-> %lld\n", (long long)otail,
    718 				      (long long)ino));
    719 			}
    720 		}
    721 	}
    722 	/* XXX: shouldn't this check be further up *before* we trash the fs? */
    723 	KASSERTMSG((ino != LFS_UNUSED_INUM), "inode 0 freed");
    724 
    725 	/*
    726 	 * Update the segment summary for the segment where the on-disk
    727 	 * copy used to be.
    728 	 */
    729 	if (!DADDR_IS_BAD(old_iaddr)) {
    730 		/* load it */
    731 		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, old_iaddr), bp);
    732 		/* the number of bytes in the segment should not become < 0 */
    733 		KASSERTMSG((sup->su_nbytes >= DINOSIZE(fs)),
    734 		    "lfs_vfree: negative byte count"
    735 		    " (segment %" PRIu32 " short by %d)\n",
    736 		    lfs_dtosn(fs, old_iaddr),
    737 		    (int)DINOSIZE(fs) - sup->su_nbytes);
    738 		/* update the number of bytes in the segment */
    739 		sup->su_nbytes -= DINOSIZE(fs);
    740 		/* write the segment entry */
    741 		LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, old_iaddr), bp); /* Ifile */
    742 	}
    743 
    744 	/* Set superblock modified bit. */
    745 	mutex_enter(&lfs_lock);
    746 	fs->lfs_fmod = 1;
    747 	mutex_exit(&lfs_lock);
    748 
    749 	/* Decrement file count. */
    750 	lfs_sb_subnfiles(fs, 1);
    751 
    752 	lfs_segunlock(fs);
    753 
    754 	return (0);
    755 }
    756 
    757 /*
    758  * Sort the freelist and set up the free-inode bitmap.
    759  * To be called by lfs_mountfs().
    760  *
    761  * Takes the segmenet lock.
    762  */
    763 void
    764 lfs_order_freelist(struct lfs *fs, ino_t **orphanp, size_t *norphanp)
    765 {
    766 	CLEANERINFO *cip;
    767 	IFILE *ifp = NULL;
    768 	struct buf *bp;
    769 	ino_t ino, firstino, lastino, maxino;
    770 	ino_t *orphan = NULL;
    771 	size_t norphan = 0;
    772 	size_t norphan_alloc = 0;
    773 
    774 	ASSERT_NO_SEGLOCK(fs);
    775 	lfs_seglock(fs, SEGM_PROT);
    776 
    777 	/* largest inode on fs */
    778 	maxino = ((fs->lfs_ivnode->v_size >> lfs_sb_getbshift(fs)) -
    779 		  lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs);
    780 
    781 	/* allocate the in-memory inode freemap */
    782 	/* XXX: assert that fs->lfs_ino_bitmap is null here */
    783 	fs->lfs_ino_bitmap =
    784 		malloc(((maxino + BMMASK) >> BMSHIFT) * sizeof(lfs_bm_t),
    785 		       M_SEGMENT, M_WAITOK | M_ZERO);
    786 	KASSERT(fs->lfs_ino_bitmap != NULL);
    787 
    788 	/*
    789 	 * Scan the ifile.
    790 	 */
    791 
    792 	firstino = lastino = LFS_UNUSED_INUM;
    793 	for (ino = 0; ino < maxino; ino++) {
    794 		/* Load this inode's ifile entry. */
    795 		if (ino % lfs_sb_getifpb(fs) == 0)
    796 			LFS_IENTRY(ifp, fs, ino, bp);
    797 		else
    798 			LFS_IENTRY_NEXT(ifp, fs);
    799 
    800 		/* Don't put zero or ifile on the free list */
    801 		if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM)
    802 			continue;
    803 
    804 		/*
    805 		 * Address orphaned files.
    806 		 *
    807 		 * The idea of this is to free inodes belonging to
    808 		 * files that were unlinked but not reclaimed, I guess
    809 		 * because if we're going to scan the whole ifile
    810 		 * anyway it costs very little to do this. I don't
    811 		 * immediately see any reason this should be disabled,
    812 		 * but presumably it doesn't work... not sure what
    813 		 * happens to such files currently. -- dholland 20160806
    814 		 */
    815 		if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE(fs)) {
    816 			if (orphan == NULL) {
    817 				norphan_alloc = 32; /* XXX pulled from arse */
    818 				orphan = kmem_zalloc(sizeof(orphan[0]) *
    819 				    norphan_alloc, KM_SLEEP);
    820 			} else if (norphan == norphan_alloc) {
    821 				ino_t *orphan_new;
    822 				if (norphan_alloc >= 4096)
    823 					norphan_alloc += 4096;
    824 				else
    825 					norphan_alloc *= 2;
    826 				orphan_new = kmem_zalloc(sizeof(orphan[0]) *
    827 				    norphan_alloc, KM_SLEEP);
    828 				memcpy(orphan_new, orphan, sizeof(orphan[0]) *
    829 				    norphan);
    830 				kmem_free(orphan, sizeof(orphan[0]) * norphan);
    831 				orphan = orphan_new;
    832 			}
    833 			orphan[norphan++] = ino;
    834 		}
    835 
    836 		if (DADDR_IS_BAD(lfs_if_getdaddr(fs, ifp))) {
    837 
    838 			/*
    839 			 * This inode is free. Put it on the free list.
    840 			 */
    841 
    842 			if (firstino == LFS_UNUSED_INUM) {
    843 				/* XXX: assert lastino == LFS_UNUSED_INUM? */
    844 				/* remember the first free inode */
    845 				firstino = ino;
    846 			} else {
    847 				/* release this inode's ifile entry */
    848 				brelse(bp, 0);
    849 
    850 				/* XXX: assert lastino != LFS_UNUSED_INUM? */
    851 
    852 				/* load lastino's ifile entry */
    853 				LFS_IENTRY(ifp, fs, lastino, bp);
    854 				/* set the list pointer */
    855 				lfs_if_setnextfree(fs, ifp, ino);
    856 				/* write the block */
    857 				LFS_BWRITE_LOG(bp);
    858 
    859 				/* reload this inode's ifile entry */
    860 				LFS_IENTRY(ifp, fs, ino, bp);
    861 			}
    862 			/* remember the last free inode seen so far */
    863 			lastino = ino;
    864 
    865 			/* Mark this inode free in the in-memory freemap */
    866 			SET_BITMAP_FREE(fs, ino);
    867 		}
    868 
    869 		/* If moving to the next ifile block, release the buffer. */
    870 		if ((ino + 1) % lfs_sb_getifpb(fs) == 0)
    871 			brelse(bp, 0);
    872 	}
    873 
    874 	/* Write the freelist head and tail pointers */
    875 	/* XXX: do we need to mark the superblock dirty? */
    876 	LFS_PUT_HEADFREE(fs, cip, bp, firstino);
    877 	LFS_PUT_TAILFREE(fs, cip, bp, lastino);
    878 
    879 	/* done */
    880 	lfs_segunlock(fs);
    881 
    882 	/*
    883 	 * Shrink the array of orphans so we don't have to carry around
    884 	 * the allocation size.
    885 	 */
    886 	if (norphan < norphan_alloc) {
    887 		ino_t *orphan_new = kmem_alloc(sizeof(orphan[0]) * norphan,
    888 		    KM_SLEEP);
    889 		memcpy(orphan_new, orphan, sizeof(orphan[0]) * norphan);
    890 		kmem_free(orphan, sizeof(orphan[0]) * norphan_alloc);
    891 		orphan = orphan_new;
    892 		norphan_alloc = norphan;
    893 	}
    894 
    895 	*orphanp = orphan;
    896 	*norphanp = norphan;
    897 }
    898 
    899 /*
    900  * Mark a file orphaned (unlinked but not yet reclaimed) by inode
    901  * number. Do this with a magic freelist next pointer.
    902  *
    903  * XXX: howzabout some locking?
    904  */
    905 void
    906 lfs_orphan(struct lfs *fs, ino_t ino)
    907 {
    908 	IFILE *ifp;
    909 	struct buf *bp;
    910 
    911 	LFS_IENTRY(ifp, fs, ino, bp);
    912 	lfs_if_setnextfree(fs, ifp, LFS_ORPHAN_NEXTFREE(fs));
    913 	LFS_BWRITE_LOG(bp);
    914 }
    915 
    916 /*
    917  * Free orphans discovered during mount.  This is a separate stage
    918  * because it requires fs->lfs_suflags to be set up, which is not done
    919  * by the time we run lfs_order_freelist.  It's possible that we could
    920  * run lfs_order_freelist later (i.e., set up fs->lfs_suflags sooner)
    921  * but that requires more thought than I can put into this at the
    922  * moment.
    923  */
    924 void
    925 lfs_free_orphans(struct lfs *fs, ino_t *orphan, size_t norphan)
    926 {
    927 	size_t i;
    928 
    929 	for (i = 0; i < norphan; i++) {
    930 		ino_t ino = orphan[i];
    931 		unsigned segno;
    932 		struct vnode *vp;
    933 		struct inode *ip;
    934 		struct buf *bp;
    935 		IFILE *ifp;
    936 		SEGUSE *sup;
    937 		int error;
    938 
    939 		/* Get the segment the inode is in on disk.  */
    940 		LFS_IENTRY(ifp, fs, ino, bp);
    941 		KASSERT(!DADDR_IS_BAD(lfs_if_getdaddr(fs, ifp)));
    942 		segno = lfs_dtosn(fs, lfs_if_getdaddr(fs, ifp));
    943 		brelse(bp, 0);
    944 
    945 		/*
    946 		 * Try to get the vnode.  If we can't, tough -- hope
    947 		 * you have backups!
    948 		 */
    949 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE,
    950 		    &vp);
    951 		if (error) {
    952 			printf("orphan %jd vget error %d\n", (intmax_t)ino,
    953 			    error);
    954 			continue;
    955 		}
    956 
    957 		/*
    958 		 * Sanity-check the inode.
    959 		 *
    960 		 * XXX What to do if it is still referenced?
    961 		 */
    962 		ip = VTOI(vp);
    963 		if (ip->i_nlink != 0)
    964 			printf("orphan %jd nlink %d\n", (intmax_t)ino,
    965 			    ip->i_nlink);
    966 
    967 		/*
    968 		 * Truncate the inode, to free any blocks allocated for
    969 		 * it, and release it, to free the inode number.
    970 		 *
    971 		 * XXX Isn't it redundant to truncate?  Won't vput do
    972 		 * that for us?
    973 		 */
    974 		error = lfs_truncate(vp, 0, 0, NOCRED);
    975 		if (error)
    976 			printf("orphan %jd truncate error %d", (intmax_t)ino,
    977 			    error);
    978 		vput(vp);
    979 
    980 		/* Update the number of bytes in the segment summary.  */
    981 		LFS_SEGENTRY(sup, fs, segno, bp);
    982 		KASSERT(sup->su_nbytes >= DINOSIZE(fs));
    983 		sup->su_nbytes -= DINOSIZE(fs);
    984 		LFS_WRITESEGENTRY(sup, fs, segno, bp);
    985 
    986 		/* Drop the on-disk address.  */
    987 		LFS_IENTRY(ifp, fs, ino, bp);
    988 		lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR);
    989 		LFS_BWRITE_LOG(bp);
    990 	}
    991 
    992 	if (orphan)
    993 		kmem_free(orphan, sizeof(orphan[0]) * norphan);
    994 }
    995