Home | History | Annotate | Line # | Download | only in lfs
      1 /*	$NetBSD: lfs_kclean.c,v 1.1 2025/11/06 15:54:27 perseant Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2025 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: lfs_kclean.c,v 1.1 2025/11/06 15:54:27 perseant Exp $");
     34 
     35 #include <sys/param.h>
     36 #include <sys/systm.h>
     37 #include <sys/namei.h>
     38 #include <sys/proc.h>
     39 #include <sys/kernel.h>
     40 #include <sys/vnode.h>
     41 #include <sys/conf.h>
     42 #include <sys/kauth.h>
     43 #include <sys/buf.h>
     44 #include <sys/kthread.h>
     45 
     46 #include <ufs/lfs/ulfs_inode.h>
     47 #include <ufs/lfs/ulfsmount.h>
     48 #include <ufs/lfs/ulfs_extern.h>
     49 
     50 #include <ufs/lfs/lfs.h>
     51 #include <ufs/lfs/lfs_accessors.h>
     52 #include <ufs/lfs/lfs_kernel.h>
     53 #include <ufs/lfs/lfs_extern.h>
     54 
     55 static int ino_func_setclean(struct lfs_inofuncarg *);
     56 static int finfo_func_rewrite(struct lfs_finfofuncarg *);
     57 static int finfo_func_setclean(struct lfs_finfofuncarg *);
     58 static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t,
     59 			 size_t, int *);
     60 
     61 static int clean(struct lfs *);
     62 static long segselect_cb_rosenblum(struct lfs *, int, SEGUSE *, long);
     63 static long segselect_greedy(struct lfs *, int, SEGUSE *);
     64 static long segselect_cb_time(struct lfs *, int, SEGUSE *);
     65 #if 0
     66 static long segselect_cb_serial(struct lfs *, int, SEGUSE *);
     67 #endif
     68 
     69 struct lwp * lfs_cleaner_daemon = NULL;
     70 extern kcondvar_t	lfs_allclean_wakeup;
     71 static int lfs_ncleaners = 0;
     72 
     73 static int
     74 ino_func_setclean(struct lfs_inofuncarg *lifa)
     75 {
     76 	struct lfs *fs;
     77 	daddr_t offset;
     78 	struct vnode *devvp, *vp;
     79 	union lfs_dinode *dip;
     80 	struct buf *dbp, *ibp;
     81 	int error;
     82 	IFILE *ifp;
     83 	unsigned i, num;
     84 	daddr_t true_addr;
     85 	ino_t ino;
     86 
     87 	fs = lifa->fs;
     88 	offset = lifa->offset;
     89 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
     90 
     91 	/* Read inode block */
     92 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
     93 	    0, &dbp);
     94 	if (error) {
     95 		DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n",
     96 		      error));
     97 		return error;
     98 	}
     99 	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
    100 	brelse(dbp, BC_AGE);
    101 
    102 	/* Check each inode against ifile entry */
    103 	num = LFS_INOPB(fs);
    104 	for (i = num; i-- > 0; ) {
    105 		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
    106 		ino = lfs_dino_getinumber(fs, dip);
    107 		if (ino == LFS_IFILE_INUM) {
    108 			/* Check address against superblock */
    109 			true_addr = lfs_sb_getidaddr(fs);
    110 		} else {
    111 			/* Not ifile.  Check address against ifile. */
    112 			LFS_IENTRY(ifp, fs, ino, ibp);
    113 			true_addr = lfs_if_getdaddr(fs, ifp);
    114 			brelse(ibp, 0);
    115 		}
    116 		if (offset != true_addr)
    117 			continue;
    118 
    119 		LFS_ASSERT_MAXINO(fs, ino);
    120 
    121 		/* XXX We can use fastvget here! */
    122 
    123 		/*
    124 		 * An inode we need to relocate.
    125 		 * Get it if we can.
    126 		 */
    127 		if (ino == LFS_IFILE_INUM)
    128 			vp = fs->lfs_ivnode;
    129 		else
    130 			error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    131 					 LK_EXCLUSIVE | LK_NOWAIT, &vp);
    132 		if (error)
    133 			continue;
    134 
    135 		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
    136 		lfs_setclean(fs, vp);
    137 		if (vp != fs->lfs_ivnode) {
    138 			VOP_UNLOCK(vp);
    139 			vrele(vp);
    140 		}
    141 	}
    142 
    143 	return error;
    144 }
    145 
    146 static int
    147 ino_func_rewrite(struct lfs_inofuncarg *lifa)
    148 {
    149 	struct lfs *fs;
    150 	daddr_t offset;
    151 	struct vnode *devvp, *vp;
    152 	union lfs_dinode *dip;
    153 	struct buf *dbp, *ibp;
    154 	int error;
    155 	IFILE *ifp;
    156 	unsigned i, num;
    157 	daddr_t true_addr;
    158 	ino_t ino;
    159 
    160 	fs = lifa->fs;
    161 	offset = lifa->offset;
    162 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    163 
    164 	/* Read inode block */
    165 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    166 	    0, &dbp);
    167 	if (error) {
    168 		DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n",
    169 		      error));
    170 		return error;
    171 	}
    172 	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
    173 	brelse(dbp, BC_AGE);
    174 
    175 	/* Check each inode against ifile entry */
    176 	num = LFS_INOPB(fs);
    177 	for (i = num; i-- > 0; ) {
    178 		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
    179 		ino = lfs_dino_getinumber(fs, dip);
    180 		if (ino == LFS_IFILE_INUM) {
    181 			/* Check address against superblock */
    182 			true_addr = lfs_sb_getidaddr(fs);
    183 		} else {
    184 			/* Not ifile.  Check address against ifile. */
    185 			LFS_IENTRY(ifp, fs, ino, ibp);
    186 			true_addr = lfs_if_getdaddr(fs, ifp);
    187 			brelse(ibp, 0);
    188 		}
    189 		if (offset != true_addr)
    190 			continue;
    191 
    192 		if (ino == LFS_IFILE_INUM)
    193 			continue;
    194 
    195 		LFS_ASSERT_MAXINO(fs, ino);
    196 
    197 		/* XXX We can use fastvget here! */
    198 
    199 		/*
    200 		 * An inode we need to relocate.
    201 		 * Get it if we can.
    202 		 */
    203 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    204 				 LK_EXCLUSIVE | LK_NOWAIT, &vp);
    205 		if (error)
    206 			continue;
    207 
    208 		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
    209 
    210 		if (!(VTOI(vp)->i_state & IN_CLEANING)) {
    211 			lfs_setclean(fs, vp);
    212 			lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
    213 		}
    214 
    215 		VOP_UNLOCK(vp);
    216 		vrele(vp);
    217 
    218 	}
    219 
    220 	return error;
    221 }
    222 
    223 static int
    224 rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop)
    225 {
    226 	daddr_t daddr;
    227 	int error;
    228 	struct buf *bp;
    229 	struct inode *ip;
    230 
    231 	KASSERT(have_finfop != NULL);
    232 
    233 	/* Look up current location of this block. */
    234 	error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
    235 	if (error)
    236 		return error;
    237 
    238 	/* Skip any block that is not here. */
    239 	if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset)
    240 		return ESTALE;
    241 
    242 	/*
    243 	 * It is (was recently) here.  Read the block.
    244 	 */
    245 	//size = lfs_blksize(fs, VTOI(vp), lbn);
    246 	error = bread(vp, lbn, size, 0, &bp);
    247 	if (error)
    248 		return error;
    249 
    250 	if (vp == fs->lfs_ivnode) {
    251 		VOP_BWRITE(vp, bp);
    252 	} else {
    253 		/* Get ready to write. */
    254 		if (!*have_finfop) {
    255 			ip = VTOI(vp);
    256 			lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
    257 			fs->lfs_sp->vp = vp;
    258 			*have_finfop = 1;
    259 		}
    260 
    261 		KASSERT(bp->b_vp == vp);
    262 		/* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */
    263 		lfs_bwrite_ext(bp, BW_CLEAN);
    264 		KASSERT(bp->b_vp == vp);
    265 		mutex_enter(&bufcache_lock);
    266 		while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) {
    267 			KASSERT(bp->b_vp != NULL);
    268 		}
    269 		mutex_exit(&bufcache_lock);
    270 
    271 		KASSERT(bp->b_flags & B_GATHERED);
    272 		KASSERT(fs->lfs_sp->cbpp[-1] == bp);
    273 	}
    274 	return 0;
    275 }
    276 
    277 static int
    278 finfo_func_rewrite(struct lfs_finfofuncarg *lffa)
    279 {
    280 	struct lfs *fs;
    281 	FINFO *fip;
    282 	daddr_t *offsetp;
    283 	int j, have_finfo, error;
    284 	size_t size, bytes;
    285 	ino_t ino;
    286 	uint32_t gen;
    287 	struct vnode *vp;
    288 	daddr_t lbn;
    289 	int *fragsp;
    290 
    291 	fs = lffa->fs;
    292 	fip = lffa->finfop;
    293 	offsetp = lffa->offsetp;
    294 	fragsp = (int *)lffa->arg;
    295 
    296 	/* Get the inode and check its version. */
    297 	ino = lfs_fi_getino(fs, fip);
    298 	gen = lfs_fi_getversion(fs, fip);
    299 	error = 0;
    300 	if (ino == LFS_IFILE_INUM)
    301 		vp = fs->lfs_ivnode;
    302 	else {
    303 		LFS_ASSERT_MAXINO(fs, ino);
    304 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    305 				 LK_EXCLUSIVE|LK_NOWAIT, &vp);
    306 	}
    307 
    308 	/*
    309 	 * If we can't, or if version is wrong, or it has dirop blocks on it,
    310 	 * we can't relocate its blocks; but we still have to count
    311 	 * blocks through the partial segment to return the right offset.
    312 	 * XXX actually we can move DIROP vnodes' *old* data, as long
    313 	 * XXX as we are sure that we are moving *only* the old data---?
    314 	 */
    315 	if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) {
    316 		if (error == 0)
    317 			error = ESTALE;
    318 
    319 		if (vp != NULL && vp != fs->lfs_ivnode) {
    320 			VOP_UNLOCK(vp);
    321 			vrele(vp);
    322 		}
    323 		vp = NULL;
    324 		bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
    325 			+ lfs_fi_getlastlength(fs, fip);
    326 		*offsetp += lfs_btofsb(fs, bytes);
    327 
    328 		return error;
    329 	}
    330 
    331 	/*
    332 	 * We have the vnode and its version is correct.
    333 	 * Take a cleaning reference; and loop through the blocks
    334 	 * and rewrite them.
    335 	 */
    336 	lfs_setclean(fs, vp);
    337 	size = lfs_sb_getbsize(fs);
    338 	have_finfo = 0;
    339 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    340 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
    341 			size = lfs_fi_getlastlength(fs, fip);
    342 		/*
    343 		 * An error of ESTALE indicates that there was nothing
    344 		 * to rewrite; this is not a problem.  Any other error
    345 		 * causes us to skip the rest of this FINFO.
    346 		 */
    347 		if (vp != NULL && error == 0) {
    348 			lbn = lfs_fi_getblock(fs, fip, j);
    349 			error = rewrite_block(fs, vp, lbn, *offsetp,
    350 					      size, &have_finfo);
    351 			if (error == ESTALE)
    352 				error = 0;
    353 			if (fragsp != NULL && error == 0)
    354 				*fragsp += lfs_btofsb(fs, size);
    355 		}
    356 		*offsetp += lfs_btofsb(fs, size);
    357 	}
    358 
    359 	/*
    360 	 * If we acquired finfo, release it and write the blocks.
    361 	 */
    362 	if (have_finfo) {
    363 		lfs_updatemeta(fs->lfs_sp);
    364 		fs->lfs_sp->vp = NULL;
    365 		lfs_release_finfo(fs);
    366 		lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
    367 	}
    368 
    369 	/* Release vnode */
    370 	if (vp != fs->lfs_ivnode) {
    371 		VOP_UNLOCK(vp);
    372 		vrele(vp);
    373 	}
    374 
    375 	return error;
    376 }
    377 
    378 static int
    379 finfo_func_setclean(struct lfs_finfofuncarg *lffa)
    380 {
    381 	struct lfs *fs;
    382 	FINFO *fip;
    383 	daddr_t *offsetp;
    384 	int error;
    385 	size_t bytes;
    386 	ino_t ino;
    387 	uint32_t gen;
    388 	struct vnode *vp;
    389 
    390 	fs = lffa->fs;
    391 	fip = lffa->finfop;
    392 	offsetp = lffa->offsetp;
    393 
    394 	/* Get the inode and check its version. */
    395 	ino = lfs_fi_getino(fs, fip);
    396 	gen = lfs_fi_getversion(fs, fip);
    397 	error = 0;
    398 	if (ino == LFS_IFILE_INUM)
    399 		vp = fs->lfs_ivnode;
    400 	else {
    401 		LFS_ASSERT_MAXINO(fs, ino);
    402 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    403 				 LK_EXCLUSIVE|LK_NOWAIT, &vp);
    404 	}
    405 
    406 	/* If we have it and its version is right, take a cleaning reference */
    407 	if (error == 0 && VTOI(vp)->i_gen == gen)
    408 		lfs_setclean(fs, vp);
    409 
    410 	if (vp == fs->lfs_ivnode)
    411 		vp = NULL;
    412 	else if (vp != NULL) {
    413 		VOP_UNLOCK(vp);
    414 		vrele(vp);
    415 		vp = NULL;
    416 	}
    417 
    418 	/* Skip to the next block */
    419 	bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
    420 		+ lfs_fi_getlastlength(fs, fip);
    421 	*offsetp += lfs_btofsb(fs, bytes);
    422 
    423 	return error;
    424 }
    425 
    426 /*
    427  * Use the partial-segment parser to rewrite (clean) a segment.
    428  */
    429 int
    430 lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l)
    431 {
    432 	daddr_t ooffset, offset, endpseg;
    433 
    434 	ASSERT_SEGLOCK(fs);
    435 
    436 	offset = lfs_sntod(fs, sn);
    437 	lfs_skip_superblock(fs, &offset);
    438 	endpseg = lfs_sntod(fs, sn + 1);
    439 
    440 	while (offset > 0 && offset != endpseg) {
    441 		/* First check summary validity (XXX unnecessary?) */
    442 		ooffset = offset;
    443 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    444 			     NULL, NULL, CKSEG_CKSUM, NULL);
    445 		if (offset == ooffset)
    446 			break;
    447 
    448 		/*
    449 		 * Valid, proceed.
    450 		 *
    451 		 * First write the file blocks, marking their
    452 		 * inodes IN_CLEANING.
    453 		 */
    454 		offset = ooffset;
    455 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    456 			       NULL, finfo_func_rewrite,
    457 			       CKSEG_NONE, fragsp);
    458 
    459 		/*
    460 		 * Now go back and pick up any inodes that
    461 		 * were not already marked IN_CLEANING, and
    462 		 * write them as well.
    463 		 */
    464 		offset = ooffset;
    465 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    466 			       ino_func_rewrite, NULL,
    467 			       CKSEG_NONE, fragsp);
    468 	}
    469 	return 0;
    470 }
    471 
    472 /*
    473  * Rewrite the contents of one or more segments, in preparation for
    474  * marking them clean.
    475  */
    476 int
    477 lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l)
    478 {
    479 	kauth_cred_t cred;
    480 	int i, error;
    481 	struct buf *bp;
    482 	SEGUSE *sup;
    483 	daddr_t offset, endpseg;
    484 
    485 	ASSERT_NO_SEGLOCK(fs);
    486 
    487 	cred = l ? l->l_cred : NOCRED;
    488 
    489 	/* Prevent new dirops and acquire the cleaner lock. */
    490 	lfs_writer_enter(fs, "rewritesegs");
    491 	if ((error = lfs_cleanerlock(fs)) != 0) {
    492 		lfs_writer_leave(fs);
    493 		return error;
    494 	}
    495 
    496 	/*
    497 	 * Pre-reference vnodes now that we have cleaner lock
    498 	 * but before we take the segment lock.  We don't want to
    499 	 * mix cleaning blocks with flushed vnodes.
    500 	 */
    501 	for (i = 0; i < len; i++) {
    502 		error = 0;
    503 		/* Refuse to clean segments that are ACTIVE */
    504 		LFS_SEGENTRY(sup, fs, snn[i], bp);
    505 		if (sup->su_flags & SEGUSE_ACTIVE
    506 		    || !(sup->su_flags & SEGUSE_DIRTY))
    507 			error = EINVAL;
    508 
    509 		brelse(bp, 0);
    510 		if (error)
    511 			break;
    512 
    513 		offset = lfs_sntod(fs, snn[i]);
    514 		lfs_skip_superblock(fs, &offset);
    515 		endpseg = lfs_sntod(fs, snn[i] + 1);
    516 
    517 		while (offset > 0 && offset != endpseg) {
    518 			lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    519 				       ino_func_setclean, finfo_func_setclean,
    520 				       CKSEG_NONE, NULL);
    521 		}
    522 	}
    523 
    524 	/*
    525 	 * Actually rewrite the contents of the segment.
    526 	 */
    527 	lfs_seglock(fs, SEGM_CLEAN);
    528 
    529 	for (i = 0; i < len; i++) {
    530 		error = 0;
    531 		/* Refuse to clean segments that are ACTIVE */
    532 		LFS_SEGENTRY(sup, fs, snn[i], bp);
    533 		if (sup->su_flags & SEGUSE_ACTIVE
    534 		    || !(sup->su_flags & SEGUSE_DIRTY))
    535 			error = EINVAL;
    536 
    537 		brelse(bp, 0);
    538 		if (error)
    539 			break;
    540 
    541 		error = lfs_rewrite_segment(fs, snn[i], directp, cred, l);
    542 		if (error) {
    543 			printf("  rewrite_segment returned %d\n", error);
    544 			break;
    545 		}
    546 	}
    547 	while (lfs_writeseg(fs, fs->lfs_sp))
    548 		;
    549 
    550 	*offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written);
    551 	lfs_segunlock(fs);
    552 	lfs_cleanerunlock(fs);
    553 	lfs_writer_leave(fs);
    554 
    555 	return error;
    556 }
    557 
#if 0
/*
 * Predicate handed to ulfs_bmaparray(): true when lbn2 equals lbn1 plus
 * the superblock's frag value.
 */
static bool
lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2)
{
	return lbn2 == lbn1 + lfs_sb_getfrag(__UNCONST(fs));
}

/*
 * Rewrite the contents of a file in order to coalesce it.
 * We don't bother rewriting indirect blocks because they will have to
 * be rewritten anyway when we rewrite the direct blocks.
 *
 * This code is compiled out.  Returns the VFS_VGET() error, or the
 * last error from ulfs_bmaparray()/rewrite_block(), or 0.
 *
 * NOTE(review): rewrite_block() records fs->lfs_sp->vp when it acquires
 * the FINFO itself; this path acquires the FINFO up front and does not
 * set that field -- confirm before enabling.
 * NOTE(review): the segment lock is asserted on entry and released on
 * exit -- confirm the intended lock ownership before enabling.
 */
int
lfs_rewrite_file(struct lfs *fs, ino_t ino, struct lwp *l)
{
	daddr_t lbn, hiblk, daddr;
	int i, error, num, run, have_finfo;
	struct vnode *vp;
	struct indir indirs[ULFS_NIADDR+2];
	size_t size;

	ASSERT_SEGLOCK(fs);

	LFS_ASSERT_MAXINO(fs, ino);

	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
	if (error)
		return error;

	lfs_acquire_finfo(fs, ino, VTOI(vp)->i_gen);
	/*
	 * rewrite_block() requires a valid pointer here; tell it that
	 * the FINFO is already held so that it is not acquired twice.
	 */
	have_finfo = 1;

	hiblk = VTOI(vp)->i_lfs_hiblk;
	for (lbn = 0; lbn < hiblk; ) {
		error = ulfs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, &run,
				       lfs_isseq);
		/* daddr and run are not meaningful after a failure. */
		if (error)
			break;
		if (daddr == UNASSIGNED) {
			++lbn;
			continue;
		}
		/*
		 * The inner loop alone advances lbn through the run, so
		 * that the block following the run is not skipped.
		 */
		for (i = 0; i <= run; i++) {
			size = lfs_blksize(fs, VTOI(vp), lbn);
			error = rewrite_block(fs, vp, lbn++, 0x0, size,
					      &have_finfo);
			if (error)
				break;
		}
	}
	lfs_release_finfo(fs);
	while (lfs_writeseg(fs, fs->lfs_sp))
		;
	lfs_segunlock(fs);

	/* Drop the lock and reference taken by VFS_VGET(). */
	VOP_UNLOCK(vp);
	vrele(vp);

	return error;
}
#endif /* 0 */
    608 
    609 
    610 static int
    611 ino_func_checkempty(struct lfs_inofuncarg *lifa)
    612 {
    613 	struct lfs *fs;
    614 	daddr_t offset;
    615 	struct vnode *devvp;
    616 	union lfs_dinode *dip;
    617 	struct buf *dbp, *ibp;
    618 	int error;
    619 	IFILE *ifp;
    620 	unsigned i, num;
    621 	daddr_t true_addr;
    622 	ino_t ino;
    623 
    624 	fs = lifa->fs;
    625 	offset = lifa->offset;
    626 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    627 
    628 	/* Read inode block */
    629 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    630 	    0, &dbp);
    631 	if (error) {
    632 		DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n",
    633 		      error));
    634 		return error;
    635 	}
    636 
    637 	/* Check each inode against ifile entry */
    638 	num = LFS_INOPB(fs);
    639 	for (i = num; i-- > 0; ) {
    640 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
    641 		ino = lfs_dino_getinumber(fs, dip);
    642 		if (ino == LFS_IFILE_INUM) {
    643 			/* Check address against superblock */
    644 			true_addr = lfs_sb_getidaddr(fs);
    645 		} else {
    646 			/* Not ifile.  Check address against ifile. */
    647 			LFS_IENTRY(ifp, fs, ino, ibp);
    648 			true_addr = lfs_if_getdaddr(fs, ifp);
    649 			brelse(ibp, 0);
    650 		}
    651 		if (offset == true_addr) {
    652 			error = EEXIST;
    653 			break;
    654 		}
    655 	}
    656 	brelse(dbp, BC_AGE);
    657 
    658 	return error;
    659 }
    660 
    661 static int
    662 finfo_func_checkempty(struct lfs_finfofuncarg *lffa)
    663 {
    664 	struct lfs *fs;
    665 	FINFO *fip;
    666 	daddr_t *offsetp;
    667 	int j, error;
    668 	size_t size, bytes;
    669 	ino_t ino;
    670 	uint32_t gen;
    671 	struct vnode *vp;
    672 	daddr_t lbn, daddr;
    673 
    674 	fs = lffa->fs;
    675 	fip = lffa->finfop;
    676 	offsetp = lffa->offsetp;
    677 
    678 	/* Get the inode and check its version. */
    679 	ino = lfs_fi_getino(fs, fip);
    680 	gen = lfs_fi_getversion(fs, fip);
    681 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
    682 
    683 	/*
    684 	 * If we can't, or if version is wrong, this FINFO does not refer
    685 	 * to a live file.  Skip over it and continue.
    686 	 */
    687 	if (error || VTOI(vp)->i_gen != gen) {
    688 		if (error == 0)
    689 			error = ESTALE;
    690 
    691 		if (vp != NULL) {
    692 			VOP_UNLOCK(vp);
    693 			vrele(vp);
    694 			vp = NULL;
    695 		}
    696 		bytes = ((lfs_fi_getnblocks(fs, fip) - 1)
    697 			 << lfs_sb_getbshift(fs))
    698 			+ lfs_fi_getlastlength(fs, fip);
    699 		*offsetp += lfs_btofsb(fs, bytes);
    700 
    701 		return error;
    702 	}
    703 
    704 	/*
    705 	 * We have the vnode and its version is correct.
    706 	 * Loop through the blocks and check their currency.
    707 	 */
    708 	size = lfs_sb_getbsize(fs);
    709 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    710 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
    711 			size = lfs_fi_getlastlength(fs, fip);
    712 		if (vp != NULL) {
    713 			lbn = lfs_fi_getblock(fs, fip, j);
    714 
    715 			/* Look up current location of this block. */
    716 			error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
    717 			if (error)
    718 				break;
    719 
    720 			/* If it is here, the segment is not empty. */
    721 			if (LFS_DBTOFSB(fs, daddr) == *offsetp) {
    722 				error = EEXIST;
    723 				break;
    724 			}
    725 		}
    726 		*offsetp += lfs_btofsb(fs, size);
    727 	}
    728 
    729 	/* Release vnode */
    730 	VOP_UNLOCK(vp);
    731 	vrele(vp);
    732 
    733 	return error;
    734 }
    735 
    736 int
    737 lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l)
    738 {
    739 	daddr_t offset, endpseg;
    740 	int error;
    741 
    742 	ASSERT_SEGLOCK(fs);
    743 
    744 	offset = lfs_sntod(fs, sn);
    745 	lfs_skip_superblock(fs, &offset);
    746 	endpseg = lfs_sntod(fs, sn + 1);
    747 
    748 	while (offset > 0 && offset < endpseg) {
    749 		error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    750 				     ino_func_checkempty,
    751 				     finfo_func_checkempty,
    752 				     CKSEG_NONE, NULL);
    753 		if (error)
    754 			return error;
    755 	}
    756 	return 0;
    757 }
    758 
    759 static long
    760 segselect_greedy(struct lfs *fs, int sn, SEGUSE *sup)
    761 {
    762 	return lfs_sb_getssize(fs) - sup->su_nbytes;
    763 }
    764 
    765 __inline static long
    766 segselect_cb_rosenblum(struct lfs *fs, int sn, SEGUSE *sup, long age)
    767 {
    768 	long benefit, cost;
    769 
    770 	benefit = (int64_t)lfs_sb_getssize(fs) - sup->su_nbytes -
    771 		(sup->su_nsums + 1) * lfs_sb_getfsize(fs);
    772 	if (sup->su_flags & SEGUSE_SUPERBLOCK)
    773 		benefit -= LFS_SBPAD;
    774 	if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs)) /* fragmentation */
    775 		benefit -= (lfs_sb_getbsize(fs) / 2);
    776 	if (benefit <= 0) {
    777 		return 0;
    778 	}
    779 
    780 	cost = lfs_sb_getssize(fs) + sup->su_nbytes;
    781 	return (256 * benefit * age) / cost;
    782 }
    783 
    784 static long
    785 segselect_cb_time(struct lfs *fs, int sn, SEGUSE *sup)
    786 {
    787 	long age;
    788 
    789 	age = time_second - sup->su_lastmod;
    790 	if (age < 0)
    791 		age = 0;
    792 	return segselect_cb_rosenblum(fs, sn, sup, age);
    793 }
    794 
#if 0
/*
 * Same as the time comparator, but fetch the serial number from the
 * segment header to compare.
 *
 * This is ugly.  Whether serial number or wall time is better is a
 * worthy question, but if we want to use serial number to compute
 * age, we should record the serial number in su_lastmod instead of
 * the time.
 *
 * Returns 0 if the first summary of the segment cannot be read or does
 * not carry SS_MAGIC.  This code is compiled out.
 */
static long
segselect_cb_serial(struct lfs *fs, int sn, SEGUSE *sup)
{
	struct buf *bp;
	uint32_t magic;
	uint64_t age, serial;
	daddr_t addr;

	/*
	 * Locate the first summary the same way the rest of this file
	 * does: segment start address, then skip any superblock.
	 */
	addr = lfs_sntod(fs, sn);
	lfs_skip_superblock(fs, &addr);

	/* An unreadable summary gets no priority, like a bad magic. */
	if (bread(fs->lfs_devvp, LFS_FSBTODB(fs, addr),
		  lfs_sb_getsumsize(fs), 0, &bp) != 0)
		return 0;

	magic = lfs_ss_getmagic(fs, ((SEGSUM *)bp->b_data));
	serial = lfs_ss_getserial(fs, ((SEGSUM *)bp->b_data));
	brelse(bp, 0);

	if (magic != SS_MAGIC)
		return 0;

	age = lfs_sb_getserial(fs) - serial;
	return segselect_cb_rosenblum(fs, sn, sup, age);
}
#endif
    828 
    829 void
    830 lfs_cleanerd(void *arg)
    831 {
    832 	mount_iterator_t *iter;
    833  	struct mount *mp;
    834  	struct lfs *fs;
    835 	struct vfsops *vfs = NULL;
    836 	int lfsc;
    837 	int cleaned_something = 0;
    838 
    839 	mutex_enter(&lfs_lock);
    840 	KASSERTMSG(lfs_cleaner_daemon == NULL,
    841 		   "more than one LFS cleaner daemon");
    842 	lfs_cleaner_daemon = curlwp;
    843 	mutex_exit(&lfs_lock);
    844 
    845 	/* Take an extra reference to the LFS vfsops. */
    846 	vfs = vfs_getopsbyname(MOUNT_LFS);
    847 
    848  	mutex_enter(&lfs_lock);
    849  	for (;;) {
    850 		KASSERT(mutex_owned(&lfs_lock));
    851 		if (cleaned_something == 0)
    852 			cv_timedwait(&lfs_allclean_wakeup, &lfs_lock, hz/10 + 1);
    853 		KASSERT(mutex_owned(&lfs_lock));
    854 		cleaned_something = 0;
    855 
    856 		KASSERT(mutex_owned(&lfs_lock));
    857 		mutex_exit(&lfs_lock);
    858 
    859  		/*
    860  		 * Look through the list of LFSs to see if any of them
    861 		 * need cleaning.
    862  		 */
    863  		mountlist_iterator_init(&iter);
    864 		lfsc = 0;
    865 		while ((mp = mountlist_iterator_next(iter)) != NULL) {
    866 			KASSERT(!mutex_owned(&lfs_lock));
    867  			if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
    868  			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
    869  				fs = VFSTOULFS(mp)->um_lfs;
    870 
    871 				mutex_enter(&lfs_lock);
    872 				if (fs->lfs_clean_selector != NULL)
    873 					++lfsc;
    874 				mutex_exit(&lfs_lock);
    875 				cleaned_something += clean(fs);
    876 			}
    877  		}
    878 		if (lfsc == 0) {
    879 			mutex_enter(&lfs_lock);
    880 			lfs_cleaner_daemon = NULL;
    881 			mutex_exit(&lfs_lock);
    882 			mountlist_iterator_destroy(iter);
    883 			break;
    884 		}
    885  		mountlist_iterator_destroy(iter);
    886 
    887  		mutex_enter(&lfs_lock);
    888  	}
    889 	KASSERT(!mutex_owned(&lfs_lock));
    890 
    891 	/* Give up our extra reference so the module can be unloaded. */
    892 	mutex_enter(&vfs_list_lock);
    893 	if (vfs != NULL)
    894 		vfs->vfs_refcount--;
    895 	mutex_exit(&vfs_list_lock);
    896 
    897 	/* Done! */
    898 	kthread_exit(0);
    899 }
    900 
    901 /*
    902  * Look at the file system to see whether it needs cleaning, and if it does,
    903  * clean a segment.
    904  */
    905 static int
    906 clean(struct lfs *fs)
    907 {
    908 	struct buf *bp;
    909 	SEGUSE *sup;
    910 	int sn, maxsn, nclean, nready, nempty, nerror, nzero, again, target;
    911 	long prio, maxprio, maxeprio, thresh;
    912 	long (*func)(struct lfs *, int, SEGUSE *);
    913 	uint32_t __debugused segflags = 0;
    914 	daddr_t oldsn, bfree, avail;
    915 	int direct, offset;
    916 
    917 	func = fs->lfs_clean_selector;
    918 	if (func == NULL)
    919 		return 0;
    920 
    921 	thresh = fs->lfs_autoclean.thresh;
    922 	if (fs->lfs_flags & LFS_MUSTCLEAN)
    923 		thresh = 0;
    924 	else if (thresh < 0) {
    925 		/*
    926 		 * Compute a priority threshold based on availability ratio.
    927 		 * XXX These numbers only makes sense for the greedy cleaner.
    928 		 * What is an appropriate threshold for the cost-benefit
    929 		 * cleaner?
    930 		 */
    931 		bfree = lfs_sb_getbfree(fs)
    932 			+ lfs_segtod(fs, 1) * lfs_sb_getminfree(fs);
    933 		avail = lfs_sb_getavail(fs) - fs->lfs_ravail - fs->lfs_favail;
    934 		if (avail > bfree)
    935 			return 0;
    936 		thresh = lfs_sb_getssize(fs) * (bfree - avail)
    937 			/ (lfs_sb_getsize(fs) - avail);
    938 		if (thresh > lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs))
    939 			thresh = lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs);
    940 		if (thresh > lfs_sb_getssize(fs) - lfs_sb_getbsize(fs))
    941 			return 0;
    942 	}
    943 
    944 	target = fs->lfs_autoclean.target;
    945 	if (target <= 0) {
    946 		/* Default to half a segment target */
    947 		target = lfs_segtod(fs, 1) / 2;
    948 	}
    949 
    950 	oldsn = lfs_dtosn(fs, lfs_sb_getoffset(fs));
    951 
    952 	again = 0;
    953 	maxprio = maxeprio = -1;
    954 	nzero = nclean = nready = nempty = nerror = 0;
    955 	for (sn = 0; sn < lfs_sb_getnseg(fs); sn++) {
    956 
    957 		prio = 0;
    958 		LFS_SEGENTRY(sup, fs, sn, bp);
    959 		if (sup->su_flags & SEGUSE_ACTIVE)
    960 			prio = 0;
    961 		else if (!(sup->su_flags & SEGUSE_DIRTY))
    962 			++nclean;
    963 		else if (sup->su_flags & SEGUSE_READY)
    964 			++nready;
    965 		else if (sup->su_flags & SEGUSE_EMPTY)
    966 			++nempty;
    967 		else if (sup->su_nbytes == 0)
    968 			++nzero;
    969 		else
    970 			prio = (*func)(fs, sn, sup);
    971 
    972 		if (sup->su_flags & SEGUSE_ERROR) {
    973 			if (prio > maxeprio)
    974 				maxeprio = prio;
    975 			prio = 0;
    976 			++nerror;
    977 		}
    978 
    979 		if (prio > maxprio) {
    980 			maxprio = prio;
    981 			maxsn = sn;
    982 			segflags = sup->su_flags;
    983 		}
    984 		brelse(bp, 0);
    985 	}
    986 	DLOG((DLOG_CLEAN, "%s clean=%d/%d zero=%d empty=%d ready=%d maxsn=%d maxprio=%ld/%ld segflags=0x%lx\n",
    987 	       (maxprio > thresh ? "YES" : "NO "),
    988 	       nclean, (int)lfs_sb_getnseg(fs), nzero, nempty, nready,
    989 	       maxsn, maxprio, (unsigned long)thresh,
    990 	       (unsigned long)segflags));
    991 
    992 	/*
    993 	 * If we are trying to clean the segment we cleaned last,
    994 	 * cleaning did not work.  Mark this segment SEGUSE_ERROR
    995 	 * and try again.
    996 	 */
    997 	if (maxprio > 0 && fs->lfs_lastcleaned == maxsn) {
    998 		LFS_SEGENTRY(sup, fs, maxsn, bp);
    999 		sup->su_flags |= SEGUSE_ERROR;
   1000 		LFS_WRITESEGENTRY(sup, fs, sn, bp);
   1001 		return 1;
   1002 	}
   1003 
   1004 	/*
   1005 	 * If there were nothing but error segments, clear error.
   1006 	 * We will wait to try again.
   1007 	 */
   1008 	if (maxprio == 0 && maxeprio > 0) {
   1009 		DLOG((DLOG_CLEAN, "clear error on %d segments, try again\n",
   1010 		      nerror));
   1011 		lfs_seguse_clrflag_all(fs, SEGUSE_ERROR);
   1012 	}
   1013 
   1014 	/* Rewrite the highest-priority segment */
   1015 	if (maxprio > thresh) {
   1016 		direct = offset = 0;
   1017 		(void)lfs_rewrite_segments(fs, &maxsn, 1,
   1018 					   &direct, &offset, curlwp);
   1019 		DLOG((DLOG_CLEAN, "  direct=%d offset=%d\n", direct, offset));
   1020 		again += direct;
   1021 		fs->lfs_clean_accum += offset;
   1022 
   1023 		/* Don't clean this again immediately */
   1024 		fs->lfs_lastcleaned = maxsn;
   1025 	}
   1026 
   1027 	/*
   1028 	 * If we are in dire straits but we have segments already
   1029 	 * empty, force a double checkpoint to reclaim them.
   1030 	 */
   1031 	if (fs->lfs_flags & LFS_MUSTCLEAN) {
   1032 		if (nready + nempty > 0) {
   1033 			printf("force checkpoint with nready=%d nempty=%d nzero=%d\n",
   1034 			       nready, nempty, nzero);
   1035 			lfs_segwrite(fs->lfs_ivnode->v_mount,
   1036 				     SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC);
   1037 			lfs_segwrite(fs->lfs_ivnode->v_mount,
   1038 				     SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC);
   1039 			++again;
   1040 		}
   1041 	} else if (fs->lfs_clean_accum > target) {
   1042 		DLOG((DLOG_CLEAN, "checkpoint to flush\n"));
   1043 		lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP);
   1044 		fs->lfs_clean_accum = 0;
   1045 	} else if (lfs_dtosn(fs, lfs_sb_getoffset(fs)) != oldsn
   1046 		   || nempty + nready > LFS_MAX_ACTIVE) { /* XXX arbitrary */
   1047 		DLOG((DLOG_CLEAN, "write to promote empty segments\n"));
   1048 		lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP);
   1049 		fs->lfs_clean_accum = 0;
   1050 	}
   1051 
   1052 	return again;
   1053 }
   1054 
   1055 /*
   1056  * Rewrite a file in its entirety.
   1057  *
   1058  * Generally this would be done to coalesce a file that is scattered
   1059  * around the disk; but if the "scramble" flag is set, instead rewrite
   1060  * only the even-numbered blocks, which provides the opposite effect
   1061  * for testing purposes.
   1062  *
   1063  * It is the caller's responsibility to check the bounds of the inode
   1064  * numbers.
   1065  */
   1066 int
   1067 lfs_rewrite_file(struct lfs *fs, ino_t *inoa, int len, bool scramble,
   1068 		 int *directp, int *offsetp)
   1069 {
   1070 	daddr_t hiblk, lbn;
   1071 	struct vnode *vp;
   1072 	struct inode *ip;
   1073 	struct buf *bp;
   1074 	int i, error, flags;
   1075 
   1076 	*directp = 0;
   1077 	if ((error = lfs_cleanerlock(fs)) != 0)
   1078 		return error;
   1079 	flags = SEGM_PROT;
   1080 	lfs_seglock(fs, flags);
   1081 	for (i = 0; i < len; ++i) {
   1082 		error = VFS_VGET(fs->lfs_ivnode->v_mount, inoa[i], LK_EXCLUSIVE, &vp);
   1083 		if (error)
   1084 			goto out;
   1085 
   1086 		ip = VTOI(vp);
   1087 		if ((vp->v_uflag & VU_DIROP) || (ip->i_flags & IN_ADIROP)) {
   1088 			VOP_UNLOCK(vp);
   1089 			vrele(vp);
   1090 			error = EAGAIN;
   1091 			goto out;
   1092 		}
   1093 
   1094 		/* Highest block in this inode */
   1095 		hiblk = lfs_lblkno(fs, ip->i_size + lfs_sb_getbsize(fs) - 1) - 1;
   1096 
   1097 		for (lbn = 0; lbn <= hiblk; ++lbn) {
   1098 			if (scramble && (lbn & 0x01))
   1099 				continue;
   1100 
   1101 			if (lfs_needsflush(fs)) {
   1102 				lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
   1103 			}
   1104 
   1105 			error = bread(vp, lbn, lfs_blksize(fs, ip, lbn), 0, &bp);
   1106 			if (error)
   1107 				break;
   1108 
   1109 			/* bp->b_cflags |= BC_INVAL; */
   1110 			lfs_bwrite_ext(bp, (flags & SEGM_CLEAN ? BW_CLEAN : 0));
   1111 			*directp += lfs_btofsb(fs, bp->b_bcount);
   1112 		}
   1113 
   1114 		/* Done with this vnode */
   1115 		VOP_UNLOCK(vp);
   1116 		vrele(vp);
   1117 		if (error)
   1118 			break;
   1119 	}
   1120 out:
   1121 	lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
   1122 	*offsetp += lfs_btofsb(fs, fs->lfs_sp->bytes_written);
   1123 	lfs_segunlock(fs);
   1124 	lfs_cleanerunlock(fs);
   1125 
   1126 	return error;
   1127 }
   1128 
   1129 int
   1130 lfs_cleanctl(struct lfs *fs, struct lfs_autoclean_params *params)
   1131 {
   1132 	long (*cleanfunc)(struct lfs *, int, SEGUSE *);
   1133 
   1134 	fs->lfs_autoclean = *params;
   1135 
   1136 	cleanfunc = NULL;
   1137 	switch (fs->lfs_autoclean.mode) {
   1138 	case LFS_CLEANMODE_NONE:
   1139 		cleanfunc = NULL;
   1140 		break;
   1141 
   1142 	case LFS_CLEANMODE_GREEDY:
   1143 		cleanfunc = segselect_greedy;
   1144 		break;
   1145 
   1146 	case LFS_CLEANMODE_CB:
   1147 		cleanfunc = segselect_cb_time;
   1148 		break;
   1149 
   1150 	default:
   1151 		return EINVAL;
   1152 	}
   1153 
   1154 	mutex_enter(&lfs_lock);
   1155 	if (fs->lfs_clean_selector == NULL && cleanfunc != NULL)
   1156 		if (++lfs_ncleaners == 1) {
   1157 			printf("Starting cleaner thread\n");
   1158 			if (lfs_cleaner_daemon == NULL &&
   1159 			    kthread_create(PRI_BIO, 0, NULL,
   1160 					   lfs_cleanerd, NULL, NULL,
   1161 					   "lfs_cleaner") != 0)
   1162 				panic("fork lfs_cleaner");
   1163 		}
   1164 	if (fs->lfs_clean_selector != NULL && cleanfunc == NULL)
   1165 		if (--lfs_ncleaners == 0) {
   1166 			printf("Stopping cleaner thread\n");
   1167 			kthread_join(lfs_cleaner_daemon);
   1168 		}
   1169 	fs->lfs_clean_selector = cleanfunc;
   1170 	mutex_exit(&lfs_lock);
   1171 
   1172 	return 0;
   1173 }
   1174