Home | History | Annotate | Line # | Download | only in lfs
      1 /*	$NetBSD: lfs_kclean.c,v 1.4 2026/01/05 05:02:47 perseant Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2025 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: lfs_kclean.c,v 1.4 2026/01/05 05:02:47 perseant Exp $");
     34 
     35 #include <sys/param.h>
     36 #include <sys/systm.h>
     37 #include <sys/namei.h>
     38 #include <sys/proc.h>
     39 #include <sys/kernel.h>
     40 #include <sys/vnode.h>
     41 #include <sys/conf.h>
     42 #include <sys/kauth.h>
     43 #include <sys/buf.h>
     44 #include <sys/kthread.h>
     45 
     46 #include <ufs/lfs/ulfs_inode.h>
     47 #include <ufs/lfs/ulfsmount.h>
     48 #include <ufs/lfs/ulfs_extern.h>
     49 
     50 #include <ufs/lfs/lfs.h>
     51 #include <ufs/lfs/lfs_accessors.h>
     52 #include <ufs/lfs/lfs_kernel.h>
     53 #include <ufs/lfs/lfs_extern.h>
     54 
     55 static int ino_func_setclean(struct lfs_inofuncarg *);
     56 static int finfo_func_rewrite(struct lfs_finfofuncarg *);
     57 static int finfo_func_setclean(struct lfs_finfofuncarg *);
     58 static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t,
     59 			 size_t, int *);
     60 
     61 static int ino_func_rewrite(struct lfs_inofuncarg *);
     62 static int ino_func_setclean(struct lfs_inofuncarg *);
     63 static int ino_func_checkempty(struct lfs_inofuncarg *);
     64 
     65 static int clean(struct lfs *);
     66 static long segselect_cb_rosenblum(struct lfs *, int, SEGUSE *, long);
     67 static long segselect_greedy(struct lfs *, int, SEGUSE *);
     68 static long segselect_cb_time(struct lfs *, int, SEGUSE *);
     69 #if 0
     70 static long segselect_cb_serial(struct lfs *, int, SEGUSE *);
     71 #endif
     72 static int check_clean_list(struct lfs *, ino_t);
     73 
     74 struct lwp * lfs_cleaner_daemon = NULL;
     75 extern kcondvar_t	lfs_allclean_wakeup;
     76 static int lfs_ncleaners = 0;
     77 
     78 static int
     79 ino_func_setclean(struct lfs_inofuncarg *lifa)
     80 {
     81 	struct lfs *fs;
     82 	daddr_t offset;
     83 	struct vnode *devvp, *vp;
     84 	union lfs_dinode *dip;
     85 	struct buf *dbp, *ibp;
     86 	int error;
     87 	IFILE *ifp;
     88 	unsigned i, num;
     89 	daddr_t true_addr;
     90 	ino_t ino;
     91 
     92 	fs = lifa->fs;
     93 	offset = lifa->offset;
     94 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
     95 
     96 	/* Read inode block */
     97 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
     98 	    0, &dbp);
     99 	if (error) {
    100 		DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n",
    101 		      error));
    102 		return error;
    103 	}
    104 	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
    105 	brelse(dbp, BC_AGE);
    106 
    107 	/* Check each inode against ifile entry */
    108 	num = LFS_INOPB(fs);
    109 	for (i = num; i-- > 0; ) {
    110 		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
    111 		ino = lfs_dino_getinumber(fs, dip);
    112 		if (ino == LFS_IFILE_INUM) {
    113 			/* Check address against superblock */
    114 			true_addr = lfs_sb_getidaddr(fs);
    115 		} else {
    116 			/* Not ifile.  Check address against ifile. */
    117 			LFS_IENTRY(ifp, fs, ino, ibp);
    118 			true_addr = lfs_if_getdaddr(fs, ifp);
    119 			brelse(ibp, 0);
    120 		}
    121 		if (offset != true_addr)
    122 			continue;
    123 
    124 		LFS_ASSERT_MAXINO(fs, ino);
    125 
    126 		/* XXX We can use fastvget here! */
    127 
    128 		/*
    129 		 * An inode we need to relocate.
    130 		 * Get it if we can.
    131 		 */
    132 		if (ino == LFS_IFILE_INUM)
    133 			vp = fs->lfs_ivnode;
    134 		else
    135 			error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    136 					 LK_EXCLUSIVE | LK_NOWAIT, &vp);
    137 		if (error)
    138 			continue;
    139 
    140 		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
    141 		lfs_setclean(fs, vp);
    142 		if (vp != fs->lfs_ivnode) {
    143 			VOP_UNLOCK(vp);
    144 			vrele(vp);
    145 		}
    146 	}
    147 
    148 	return error;
    149 }
    150 
    151 static int
    152 ino_func_rewrite(struct lfs_inofuncarg *lifa)
    153 {
    154 	struct lfs *fs;
    155 	daddr_t offset;
    156 	struct vnode *devvp, *vp;
    157 	union lfs_dinode *dip;
    158 	struct buf *dbp, *ibp;
    159 	int error;
    160 	IFILE *ifp;
    161 	unsigned i, num;
    162 	daddr_t true_addr;
    163 	ino_t ino;
    164 
    165 	fs = lifa->fs;
    166 	offset = lifa->offset;
    167 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    168 
    169 	/* Read inode block */
    170 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    171 	    0, &dbp);
    172 	if (error) {
    173 		DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n",
    174 		      error));
    175 		return error;
    176 	}
    177 	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
    178 	brelse(dbp, BC_AGE);
    179 
    180 	/* Check each inode against ifile entry */
    181 	num = LFS_INOPB(fs);
    182 	for (i = num; i-- > 0; ) {
    183 		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
    184 		ino = lfs_dino_getinumber(fs, dip);
    185 		if (ino == LFS_IFILE_INUM) {
    186 			/* Check address against superblock */
    187 			true_addr = lfs_sb_getidaddr(fs);
    188 		} else {
    189 			/* Not ifile.  Check address against ifile. */
    190 			LFS_IENTRY(ifp, fs, ino, ibp);
    191 			true_addr = lfs_if_getdaddr(fs, ifp);
    192 			brelse(ibp, 0);
    193 		}
    194 		if (offset != true_addr)
    195 			continue;
    196 
    197 		if (ino == LFS_IFILE_INUM)
    198 			continue;
    199 
    200 		LFS_ASSERT_MAXINO(fs, ino);
    201 
    202 		/* XXX We can use fastvget here! */
    203 
    204 		/*
    205 		 * An inode we need to relocate.
    206 		 * Get it if we can.
    207 		 */
    208 		error = check_clean_list(fs, ino);
    209 		if (error)
    210 			continue;
    211 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    212 				 LK_EXCLUSIVE | LK_NOWAIT, &vp);
    213 		if (error)
    214 			continue;
    215 
    216 		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
    217 
    218 		if (!(VTOI(vp)->i_state & IN_CLEANING)) {
    219 			lfs_setclean(fs, vp);
    220 			lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
    221 		}
    222 
    223 		VOP_UNLOCK(vp);
    224 		vrele(vp);
    225 
    226 	}
    227 
    228 	return error;
    229 }
    230 
    231 static int
    232 rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop)
    233 {
    234 	daddr_t daddr;
    235 	int error;
    236 	struct buf *bp;
    237 	struct inode *ip;
    238 
    239 	KASSERT(have_finfop != NULL);
    240 
    241 	/* Look up current location of this block. */
    242 	error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
    243 	if (error)
    244 		return error;
    245 
    246 	/* Skip any block that is not here. */
    247 	if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset)
    248 		return ESTALE;
    249 
    250 	/*
    251 	 * It is (was recently) here.  Read the block.
    252 	 */
    253 	//size = lfs_blksize(fs, VTOI(vp), lbn);
    254 	error = bread(vp, lbn, size, 0, &bp);
    255 	if (error)
    256 		return error;
    257 
    258 	if (vp == fs->lfs_ivnode) {
    259 		VOP_BWRITE(vp, bp);
    260 	} else {
    261 		/* Get ready to write. */
    262 		if (!*have_finfop) {
    263 			ip = VTOI(vp);
    264 			lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
    265 			fs->lfs_sp->vp = vp;
    266 			*have_finfop = 1;
    267 		}
    268 
    269 		KASSERT(bp->b_vp == vp);
    270 		/* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */
    271 		error = lfs_bwrite_ext(bp, BW_CLEAN);
    272 		if (error)
    273 			return error;
    274 		KASSERT(bp->b_vp == vp);
    275 		mutex_enter(&bufcache_lock);
    276 		while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) {
    277 			KASSERT(bp->b_vp != NULL);
    278 		}
    279 		mutex_exit(&bufcache_lock);
    280 
    281 		KASSERT(bp->b_flags & B_GATHERED);
    282 		KASSERT(fs->lfs_sp->cbpp[-1] == bp);
    283 	}
    284 	return 0;
    285 }
    286 
    287 static int
    288 check_clean_list(struct lfs *fs, ino_t ino)
    289 {
    290 	struct inode *ip;
    291 
    292 	/*
    293 	 * Look for the inode on the clean list.
    294 	 * If it is not there, we can't lock it without risking a deadlock.
    295 	 */
    296 	TAILQ_FOREACH(ip, &fs->lfs_cleanhd, i_lfs_clean) {
    297 		if (ip->i_number == ino) {
    298 			return 0;
    299 		}
    300 	}
    301 	return EWOULDBLOCK;
    302 }
    303 
    304 static int
    305 finfo_func_rewrite(struct lfs_finfofuncarg *lffa)
    306 {
    307 	struct lfs *fs;
    308 	FINFO *fip;
    309 	daddr_t *offsetp;
    310 	int j, have_finfo, error;
    311 	size_t size, bytes;
    312 	ino_t ino;
    313 	uint32_t gen;
    314 	struct vnode *vp;
    315 	daddr_t lbn;
    316 	int *fragsp;
    317 
    318 	fs = lffa->fs;
    319 	fip = lffa->finfop;
    320 	offsetp = lffa->offsetp;
    321 	fragsp = (int *)lffa->arg;
    322 
    323 	/* Get the inode and check its version. */
    324 	ino = lfs_fi_getino(fs, fip);
    325 	gen = lfs_fi_getversion(fs, fip);
    326 	error = 0;
    327 	if (ino == LFS_IFILE_INUM)
    328 		vp = fs->lfs_ivnode;
    329 	else {
    330 		LFS_ASSERT_MAXINO(fs, ino);
    331 		error = check_clean_list(fs, ino);
    332 		if (error)
    333 			vp = NULL;
    334 		else
    335 			error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    336 					 LK_EXCLUSIVE|LK_NOWAIT, &vp);
    337 	}
    338 
    339 	/*
    340 	 * If we can't, or if version is wrong, or it has dirop blocks on it,
    341 	 * we can't relocate its blocks; but we still have to count
    342 	 * blocks through the partial segment to return the right offset.
    343 	 * XXX actually we can move DIROP vnodes' *old* data, as long
    344 	 * XXX as we are sure that we are moving *only* the old data---?
    345 	 */
    346 	if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) {
    347 		if (error == 0)
    348 			error = ESTALE;
    349 
    350 		if (vp != NULL && vp != fs->lfs_ivnode) {
    351 			VOP_UNLOCK(vp);
    352 			vrele(vp);
    353 		}
    354 		vp = NULL;
    355 		bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
    356 			+ lfs_fi_getlastlength(fs, fip);
    357 		*offsetp += lfs_btofsb(fs, bytes);
    358 
    359 		return error;
    360 	}
    361 
    362 	/*
    363 	 * We have the vnode and its version is correct.
    364 	 * Take a cleaning reference; and loop through the blocks
    365 	 * and rewrite them.
    366 	 */
    367 	lfs_setclean(fs, vp);
    368 	size = lfs_sb_getbsize(fs);
    369 	have_finfo = 0;
    370 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    371 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
    372 			size = lfs_fi_getlastlength(fs, fip);
    373 		/*
    374 		 * An error of ESTALE indicates that there was nothing
    375 		 * to rewrite; this is not a problem.  Any other error
    376 		 * causes us to skip the rest of this FINFO.
    377 		 */
    378 		if (vp != NULL && error == 0) {
    379 			lbn = lfs_fi_getblock(fs, fip, j);
    380 			error = rewrite_block(fs, vp, lbn, *offsetp,
    381 					      size, &have_finfo);
    382 			if (error == ESTALE)
    383 				error = 0;
    384 			if (fragsp != NULL && error == 0)
    385 				*fragsp += lfs_btofsb(fs, size);
    386 		}
    387 		*offsetp += lfs_btofsb(fs, size);
    388 	}
    389 
    390 	/*
    391 	 * If we acquired finfo, release it and write the blocks.
    392 	 */
    393 	if (have_finfo) {
    394 		lfs_updatemeta(fs->lfs_sp);
    395 		fs->lfs_sp->vp = NULL;
    396 		lfs_release_finfo(fs);
    397 		lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
    398 	}
    399 
    400 	/* Release vnode */
    401 	if (vp != fs->lfs_ivnode) {
    402 		VOP_UNLOCK(vp);
    403 		vrele(vp);
    404 	}
    405 
    406 	return error;
    407 }
    408 
    409 static int
    410 finfo_func_setclean(struct lfs_finfofuncarg *lffa)
    411 {
    412 	struct lfs *fs;
    413 	FINFO *fip;
    414 	daddr_t *offsetp;
    415 	int error;
    416 	size_t bytes;
    417 	ino_t ino;
    418 	uint32_t gen;
    419 	struct vnode *vp;
    420 
    421 	fs = lffa->fs;
    422 	fip = lffa->finfop;
    423 	offsetp = lffa->offsetp;
    424 
    425 	/* Get the inode and check its version. */
    426 	ino = lfs_fi_getino(fs, fip);
    427 	gen = lfs_fi_getversion(fs, fip);
    428 	error = 0;
    429 	if (ino == LFS_IFILE_INUM)
    430 		vp = fs->lfs_ivnode;
    431 	else {
    432 		LFS_ASSERT_MAXINO(fs, ino);
    433 		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
    434 				 LK_EXCLUSIVE|LK_NOWAIT, &vp);
    435 	}
    436 
    437 	/* If we have it and its version is right, take a cleaning reference */
    438 	if (error == 0 && VTOI(vp)->i_gen == gen)
    439 		lfs_setclean(fs, vp);
    440 
    441 	if (vp == fs->lfs_ivnode)
    442 		vp = NULL;
    443 	else if (vp != NULL) {
    444 		VOP_UNLOCK(vp);
    445 		vrele(vp);
    446 		vp = NULL;
    447 	}
    448 
    449 	/* Skip to the next block */
    450 	bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
    451 		+ lfs_fi_getlastlength(fs, fip);
    452 	*offsetp += lfs_btofsb(fs, bytes);
    453 
    454 	return error;
    455 }
    456 
    457 /*
    458  * Use the partial-segment parser to rewrite (clean) a segment.
    459  */
    460 int
    461 lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l)
    462 {
    463 	daddr_t ooffset, offset, endpseg;
    464 
    465 	ASSERT_SEGLOCK(fs);
    466 
    467 	offset = lfs_sntod(fs, sn);
    468 	lfs_skip_superblock(fs, &offset);
    469 	endpseg = lfs_sntod(fs, sn + 1);
    470 
    471 	while (offset > 0 && offset != endpseg) {
    472 		/* First check summary validity (XXX unnecessary?) */
    473 		ooffset = offset;
    474 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    475 			     NULL, NULL, CKSEG_CKSUM, NULL);
    476 		if (offset == ooffset)
    477 			break;
    478 
    479 		/*
    480 		 * Valid, proceed.
    481 		 *
    482 		 * First write the file blocks, marking their
    483 		 * inodes IN_CLEANING.
    484 		 */
    485 		offset = ooffset;
    486 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    487 			       NULL, finfo_func_rewrite,
    488 			       CKSEG_NONE, fragsp);
    489 
    490 		/*
    491 		 * Now go back and pick up any inodes that
    492 		 * were not already marked IN_CLEANING, and
    493 		 * write them as well.
    494 		 */
    495 		offset = ooffset;
    496 		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    497 			       ino_func_rewrite, NULL,
    498 			       CKSEG_NONE, fragsp);
    499 	}
    500 	return 0;
    501 }
    502 
    503 /*
    504  * Rewrite the contents of one or more segments, in preparation for
    505  * marking them clean.
    506  */
    507 int
    508 lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l)
    509 {
    510 	kauth_cred_t cred;
    511 	int i, error;
    512 	struct buf *bp;
    513 	SEGUSE *sup;
    514 	daddr_t offset, endpseg;
    515 
    516 	ASSERT_NO_SEGLOCK(fs);
    517 
    518 	cred = l ? l->l_cred : NOCRED;
    519 
    520 	/* Prevent new dirops and acquire the cleaner lock. */
    521 	lfs_writer_enter(fs, "rewritesegs");
    522 	if ((error = lfs_cleanerlock(fs)) != 0) {
    523 		lfs_writer_leave(fs);
    524 		return error;
    525 	}
    526 
    527 	/*
    528 	 * Pre-reference vnodes now that we have cleaner lock
    529 	 * but before we take the segment lock.  We don't want to
    530 	 * mix cleaning blocks with flushed vnodes.
    531 	 */
    532 	for (i = 0; i < len; i++) {
    533 		error = 0;
    534 		/* Refuse to clean segments that are ACTIVE */
    535 		LFS_SEGENTRY(sup, fs, snn[i], bp);
    536 		if (sup->su_flags & SEGUSE_ACTIVE
    537 		    || !(sup->su_flags & SEGUSE_DIRTY))
    538 			error = EINVAL;
    539 
    540 		brelse(bp, 0);
    541 		if (error)
    542 			break;
    543 
    544 		offset = lfs_sntod(fs, snn[i]);
    545 		lfs_skip_superblock(fs, &offset);
    546 		endpseg = lfs_sntod(fs, snn[i] + 1);
    547 
    548 		while (offset > 0 && offset != endpseg) {
    549 			lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    550 				       ino_func_setclean, finfo_func_setclean,
    551 				       CKSEG_NONE, NULL);
    552 		}
    553 	}
    554 
    555 	/*
    556 	 * Actually rewrite the contents of the segment.
    557 	 */
    558 	lfs_seglock(fs, SEGM_CLEAN);
    559 
    560 	for (i = 0; i < len; i++) {
    561 		error = 0;
    562 		/* Refuse to clean segments that are ACTIVE */
    563 		LFS_SEGENTRY(sup, fs, snn[i], bp);
    564 		if (sup->su_flags & SEGUSE_ACTIVE
    565 		    || !(sup->su_flags & SEGUSE_DIRTY))
    566 			error = EINVAL;
    567 
    568 		brelse(bp, 0);
    569 		if (error)
    570 			break;
    571 
    572 		error = lfs_rewrite_segment(fs, snn[i], directp, cred, l);
    573 		if (error)
    574 			break;
    575 	}
    576 	while (lfs_writeseg(fs, fs->lfs_sp))
    577 		;
    578 
    579 	*offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written);
    580 	lfs_segunlock(fs);
    581 	lfs_cleanerunlock(fs);
    582 	lfs_writer_leave(fs);
    583 
    584 	return error;
    585 }
    586 
    587 static int
    588 ino_func_checkempty(struct lfs_inofuncarg *lifa)
    589 {
    590 	struct lfs *fs;
    591 	daddr_t offset;
    592 	struct vnode *devvp;
    593 	union lfs_dinode *dip;
    594 	struct buf *dbp, *ibp;
    595 	int error;
    596 	IFILE *ifp;
    597 	unsigned i, num;
    598 	daddr_t true_addr;
    599 	ino_t ino;
    600 
    601 	fs = lifa->fs;
    602 	offset = lifa->offset;
    603 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
    604 
    605 	/* Read inode block */
    606 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
    607 	    0, &dbp);
    608 	if (error) {
    609 		DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n",
    610 		      error));
    611 		return error;
    612 	}
    613 
    614 	/* Check each inode against ifile entry */
    615 	num = LFS_INOPB(fs);
    616 	for (i = num; i-- > 0; ) {
    617 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
    618 		ino = lfs_dino_getinumber(fs, dip);
    619 		if (ino == LFS_IFILE_INUM) {
    620 			/* Check address against superblock */
    621 			true_addr = lfs_sb_getidaddr(fs);
    622 		} else {
    623 			/* Not ifile.  Check address against ifile. */
    624 			LFS_IENTRY(ifp, fs, ino, ibp);
    625 			true_addr = lfs_if_getdaddr(fs, ifp);
    626 			brelse(ibp, 0);
    627 		}
    628 		if (offset == true_addr) {
    629 			error = EEXIST;
    630 			break;
    631 		}
    632 	}
    633 	brelse(dbp, BC_AGE);
    634 
    635 	return error;
    636 }
    637 
    638 static int
    639 finfo_func_checkempty(struct lfs_finfofuncarg *lffa)
    640 {
    641 	struct lfs *fs;
    642 	FINFO *fip;
    643 	daddr_t *offsetp;
    644 	int j, error;
    645 	size_t size, bytes;
    646 	ino_t ino;
    647 	uint32_t gen;
    648 	struct vnode *vp;
    649 	daddr_t lbn, daddr;
    650 
    651 	fs = lffa->fs;
    652 	fip = lffa->finfop;
    653 	offsetp = lffa->offsetp;
    654 
    655 	/* Get the inode and check its version. */
    656 	ino = lfs_fi_getino(fs, fip);
    657 	gen = lfs_fi_getversion(fs, fip);
    658 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE|LK_NOWAIT, &vp);
    659 
    660 	/*
    661 	 * If we can't, or if version is wrong, this FINFO does not refer
    662 	 * to a live file.  Skip over it and continue.
    663 	 */
    664 	if (error || VTOI(vp)->i_gen != gen) {
    665 		if (error == 0)
    666 			error = ESTALE;
    667 
    668 		if (vp != NULL) {
    669 			VOP_UNLOCK(vp);
    670 			vrele(vp);
    671 			vp = NULL;
    672 		}
    673 		bytes = ((lfs_fi_getnblocks(fs, fip) - 1)
    674 			 << lfs_sb_getbshift(fs))
    675 			+ lfs_fi_getlastlength(fs, fip);
    676 		*offsetp += lfs_btofsb(fs, bytes);
    677 
    678 		return error;
    679 	}
    680 
    681 	/*
    682 	 * We have the vnode and its version is correct.
    683 	 * Loop through the blocks and check their currency.
    684 	 */
    685 	size = lfs_sb_getbsize(fs);
    686 	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
    687 		if (j == lfs_fi_getnblocks(fs, fip) - 1)
    688 			size = lfs_fi_getlastlength(fs, fip);
    689 		if (vp != NULL) {
    690 			lbn = lfs_fi_getblock(fs, fip, j);
    691 
    692 			/* Look up current location of this block. */
    693 			error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
    694 			if (error)
    695 				break;
    696 
    697 			/* If it is here, the segment is not empty. */
    698 			if (LFS_DBTOFSB(fs, daddr) == *offsetp) {
    699 				error = EEXIST;
    700 				break;
    701 			}
    702 		}
    703 		*offsetp += lfs_btofsb(fs, size);
    704 	}
    705 
    706 	/* Release vnode */
    707 	VOP_UNLOCK(vp);
    708 	vrele(vp);
    709 
    710 	return error;
    711 }
    712 
    713 int
    714 lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l)
    715 {
    716 	daddr_t offset, endpseg;
    717 	int error;
    718 
    719 	ASSERT_SEGLOCK(fs);
    720 
    721 	offset = lfs_sntod(fs, sn);
    722 	lfs_skip_superblock(fs, &offset);
    723 	endpseg = lfs_sntod(fs, sn + 1);
    724 
    725 	while (offset > 0 && offset < endpseg) {
    726 		error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
    727 				     ino_func_checkempty,
    728 				     finfo_func_checkempty,
    729 				     CKSEG_NONE, NULL);
    730 		if (error)
    731 			return error;
    732 	}
    733 	return 0;
    734 }
    735 
    736 static long
    737 segselect_greedy(struct lfs *fs, int sn, SEGUSE *sup)
    738 {
    739 	return lfs_sb_getssize(fs) - sup->su_nbytes;
    740 }
    741 
    742 __inline static long
    743 segselect_cb_rosenblum(struct lfs *fs, int sn, SEGUSE *sup, long age)
    744 {
    745 	long benefit, cost;
    746 
    747 	benefit = (int64_t)lfs_sb_getssize(fs) - sup->su_nbytes -
    748 		(sup->su_nsums + 1) * lfs_sb_getfsize(fs);
    749 	if (sup->su_flags & SEGUSE_SUPERBLOCK)
    750 		benefit -= LFS_SBPAD;
    751 	if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs)) /* fragmentation */
    752 		benefit -= (lfs_sb_getbsize(fs) / 2);
    753 	if (benefit <= 0) {
    754 		return 0;
    755 	}
    756 
    757 	cost = lfs_sb_getssize(fs) + sup->su_nbytes;
    758 	return (256 * benefit * age) / cost;
    759 }
    760 
    761 static long
    762 segselect_cb_time(struct lfs *fs, int sn, SEGUSE *sup)
    763 {
    764 	long age;
    765 
    766 	age = time_second - sup->su_lastmod;
    767 	if (age < 0)
    768 		age = 0;
    769 	return segselect_cb_rosenblum(fs, sn, sup, age);
    770 }
    771 
#if 0
/*
 * Same as the time comparator, but fetch the serial number from the
 * segment header to compare.
 *
 * This is ugly.  Whether serial number or wall time is better is a
 * worthy question, but if we want to use serial number to compute
 * age, we should record the serial number in su_lastmod instead of
 * the time.
 *
 * Returns 0 if the first summary of the segment cannot be read or does
 * not carry SS_MAGIC.
 */
static long
segselect_cb_serial(struct lfs *fs, int sn, SEGUSE *sup)
{
	struct buf *bp;
	uint32_t magic;
	uint64_t age, serial;
	daddr_t addr;
	int error;

	/* Address of the segment's first summary block. */
	addr = lfs_sntod(fs, sn);
	lfs_skip_superblock(fs, &addr);
	error = bread(fs->lfs_devvp, LFS_FSBTODB(fs, addr),
	      lfs_sb_getsumsize(fs), 0, &bp);
	if (error)
		return 0;	/* no usable buffer to inspect */
	magic = lfs_ss_getmagic(fs, ((SEGSUM *)bp->b_data));
	serial = lfs_ss_getserial(fs, ((SEGSUM *)bp->b_data));
	brelse(bp, 0);

	if (magic != SS_MAGIC)
		return 0;

	age = lfs_sb_getserial(fs) - serial;
	return segselect_cb_rosenblum(fs, sn, sup, age);
}
#endif
    805 
    806 void
    807 lfs_cleanerd(void *arg)
    808 {
    809 	mount_iterator_t *iter;
    810  	struct mount *mp;
    811  	struct lfs *fs;
    812 	struct vfsops *vfs = NULL;
    813 	int lfsc;
    814 	int cleaned_something = 0;
    815 
    816 	/* Take an extra reference to the LFS vfsops. */
    817 	vfs = vfs_getopsbyname(MOUNT_LFS);
    818 
    819  	mutex_enter(&lfs_lock);
    820  	for (;;) {
    821 		KASSERT(mutex_owned(&lfs_lock));
    822 		if (cleaned_something == 0)
    823 			cv_timedwait(&lfs_allclean_wakeup, &lfs_lock, hz/10 + 1);
    824 		KASSERT(mutex_owned(&lfs_lock));
    825 		cleaned_something = 0;
    826 
    827 		KASSERT(mutex_owned(&lfs_lock));
    828 		mutex_exit(&lfs_lock);
    829 
    830  		/*
    831  		 * Look through the list of LFSs to see if any of them
    832 		 * need cleaning.
    833  		 */
    834  		mountlist_iterator_init(&iter);
    835 		lfsc = 0;
    836 		while ((mp = mountlist_iterator_next(iter)) != NULL) {
    837 			KASSERT(!mutex_owned(&lfs_lock));
    838  			if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
    839  			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
    840  				fs = VFSTOULFS(mp)->um_lfs;
    841 
    842 				mutex_enter(&lfs_lock);
    843 				if (fs->lfs_clean_selector == NULL) {
    844 					/* Notify cleanctl */
    845 					if (fs->lfs_autoclean_status) {
    846 						fs->lfs_autoclean_status =
    847 							LFS_AUTOCLEAN_STATUS_OFF;
    848 						cv_broadcast(&fs->lfs_cleanquitcv);
    849 					}
    850 				} else
    851 					++lfsc;
    852 				mutex_exit(&lfs_lock);
    853 				cleaned_something += clean(fs);
    854 			}
    855  		}
    856 		if (lfsc == 0) {
    857 			mutex_enter(&lfs_lock);
    858 			lfs_cleaner_daemon = NULL;
    859 			mutex_exit(&lfs_lock);
    860 			mountlist_iterator_destroy(iter);
    861 			break;
    862 		}
    863  		mountlist_iterator_destroy(iter);
    864 
    865  		mutex_enter(&lfs_lock);
    866  	}
    867 	KASSERT(!mutex_owned(&lfs_lock));
    868 
    869 	/* Give up our extra reference so the module can be unloaded. */
    870 	mutex_enter(&vfs_list_lock);
    871 	if (vfs != NULL)
    872 		vfs->vfs_refcount--;
    873 	mutex_exit(&vfs_list_lock);
    874 
    875 	/* Done! */
    876 	kthread_exit(0);
    877 }
    878 
    879 /*
    880  * Look at the file system to see whether it needs cleaning, and if it does,
    881  * clean a segment.
    882  */
    883 static int
    884 clean(struct lfs *fs)
    885 {
    886 	struct buf *bp;
    887 	SEGUSE *sup;
    888 	int sn, maxsn, nclean, nready, nempty, nerror, nzero, again, target;
    889 	long prio, maxprio, maxeprio, thresh;
    890 	long (*func)(struct lfs *, int, SEGUSE *);
    891 	uint32_t __debugused segflags = 0;
    892 	daddr_t oldsn, bfree, avail;
    893 	int direct, offset;
    894 
    895 	mutex_enter(&lfs_lock);
    896 	func = fs->lfs_clean_selector;
    897 	mutex_exit(&lfs_lock);
    898 	if (func == NULL)
    899 		return 1; /* Run again so we get cleaned up immediately */
    900 
    901 	thresh = fs->lfs_autoclean.thresh;
    902 	if (fs->lfs_flags & LFS_MUSTCLEAN)
    903 		thresh = 0;
    904 	else if (thresh < 0) {
    905 		/*
    906 		 * Compute a priority threshold based on availability ratio.
    907 		 * XXX These numbers only makes sense for the greedy cleaner.
    908 		 * What is an appropriate threshold for the cost-benefit
    909 		 * cleaner?
    910 		 */
    911 		bfree = lfs_sb_getbfree(fs)
    912 			+ lfs_segtod(fs, 1) * lfs_sb_getminfree(fs);
    913 		avail = lfs_sb_getavail(fs) - fs->lfs_ravail - fs->lfs_favail;
    914 		if (avail > bfree)
    915 			return 0;
    916 		thresh = lfs_sb_getssize(fs) * (bfree - avail)
    917 			/ (lfs_sb_getsize(fs) - avail);
    918 		if (thresh > lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs))
    919 			thresh = lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs);
    920 		if (thresh > lfs_sb_getssize(fs) - lfs_sb_getbsize(fs))
    921 			return 0;
    922 	}
    923 
    924 	target = fs->lfs_autoclean.target;
    925 	if (target <= 0) {
    926 		/* Default to half a segment target */
    927 		target = lfs_segtod(fs, 1) / 2;
    928 	}
    929 
    930 	oldsn = lfs_dtosn(fs, lfs_sb_getoffset(fs));
    931 
    932 	again = 0;
    933 	maxprio = maxeprio = -1;
    934 	nzero = nclean = nready = nempty = nerror = 0;
    935 	for (sn = 0; sn < lfs_sb_getnseg(fs); sn++) {
    936 
    937 		prio = 0;
    938 		LFS_SEGENTRY(sup, fs, sn, bp);
    939 		if (sup->su_flags & SEGUSE_ACTIVE)
    940 			prio = 0;
    941 		else if (!(sup->su_flags & SEGUSE_DIRTY))
    942 			++nclean;
    943 		else if (sup->su_flags & SEGUSE_READY)
    944 			++nready;
    945 		else if (sup->su_flags & SEGUSE_EMPTY)
    946 			++nempty;
    947 		else if (sup->su_nbytes == 0)
    948 			++nzero;
    949 		else
    950 			prio = (*func)(fs, sn, sup);
    951 
    952 		if (sup->su_flags & SEGUSE_ERROR) {
    953 			if (prio > maxeprio)
    954 				maxeprio = prio;
    955 			prio = 0;
    956 			++nerror;
    957 		}
    958 
    959 		if (prio > maxprio) {
    960 			maxprio = prio;
    961 			maxsn = sn;
    962 			segflags = sup->su_flags;
    963 		}
    964 		brelse(bp, 0);
    965 	}
    966 	DLOG((DLOG_CLEAN, "%s clean=%d/%d zero=%d empty=%d ready=%d maxsn=%d maxprio=%ld/%ld segflags=0x%lx\n",
    967 	       (maxprio > thresh ? "YES" : "NO "),
    968 	       nclean, (int)lfs_sb_getnseg(fs), nzero, nempty, nready,
    969 	       maxsn, maxprio, (unsigned long)thresh,
    970 	       (unsigned long)segflags));
    971 
    972 	/*
    973 	 * If we are trying to clean the segment we cleaned last,
    974 	 * cleaning did not work.  Mark this segment SEGUSE_ERROR
    975 	 * and try again.
    976 	 */
    977 	if (maxprio > 0 && fs->lfs_lastcleaned == maxsn) {
    978 		LFS_SEGENTRY(sup, fs, maxsn, bp);
    979 		sup->su_flags |= SEGUSE_ERROR;
    980 		LFS_WRITESEGENTRY(sup, fs, sn, bp);
    981 		return 1;
    982 	}
    983 
    984 	/*
    985 	 * If there were nothing but error segments, clear error.
    986 	 * We will wait to try again.
    987 	 */
    988 	if (maxprio == 0 && maxeprio > 0) {
    989 		DLOG((DLOG_CLEAN, "clear error on %d segments, try again\n",
    990 		      nerror));
    991 		lfs_seguse_clrflag_all(fs, SEGUSE_ERROR);
    992 	}
    993 
    994 	/* Rewrite the highest-priority segment */
    995 	if (maxprio > thresh) {
    996 		direct = offset = 0;
    997 		(void)lfs_rewrite_segments(fs, &maxsn, 1,
    998 					   &direct, &offset, curlwp);
    999 		DLOG((DLOG_CLEAN, "  direct=%d offset=%d\n", direct, offset));
   1000 		again += direct;
   1001 		fs->lfs_clean_accum += offset;
   1002 
   1003 		/* Don't clean this again immediately */
   1004 		fs->lfs_lastcleaned = maxsn;
   1005 	}
   1006 
   1007 	/*
   1008 	 * If we are in dire straits but we have segments already
   1009 	 * empty, force a double checkpoint to reclaim them.
   1010 	 */
   1011 	if (fs->lfs_flags & LFS_MUSTCLEAN) {
   1012 		if (nready + nempty > 0) {
   1013 			DLOG((DLOG_CLEAN, "force checkpoint with nready=%d nempty=%d nzero=%d\n",
   1014 				nready, nempty, nzero));
   1015 			lfs_segwrite(fs->lfs_ivnode->v_mount,
   1016 				     SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC);
   1017 			lfs_segwrite(fs->lfs_ivnode->v_mount,
   1018 				     SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC);
   1019 			++again;
   1020 		}
   1021 	} else if (fs->lfs_clean_accum > target) {
   1022 		DLOG((DLOG_CLEAN, "checkpoint to flush\n"));
   1023 		lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP);
   1024 		fs->lfs_clean_accum = 0;
   1025 	} else if (lfs_dtosn(fs, lfs_sb_getoffset(fs)) != oldsn
   1026 		   || nempty + nready > LFS_MAX_ACTIVE) { /* XXX arbitrary */
   1027 		DLOG((DLOG_CLEAN, "write to promote empty segments\n"));
   1028 		lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP);
   1029 		fs->lfs_clean_accum = 0;
   1030 	}
   1031 
   1032 	return again;
   1033 }
   1034 
   1035 /*
   1036  * Rewrite a file in its entirety.
   1037  *
   1038  * Generally this would be done to coalesce a file that is scattered
   1039  * around the disk; but if the "scramble" flag is set, instead rewrite
   1040  * only the even-numbered blocks, which provides the opposite effect
   1041  * for testing purposes.
   1042  *
   1043  * It is the caller's responsibility to check the bounds of the inode
   1044  * numbers.
   1045  */
   1046 int
   1047 lfs_rewrite_file(struct lfs *fs, ino_t *inoa, int len, bool scramble,
   1048 		 int *directp, int *offsetp)
   1049 {
   1050 	daddr_t hiblk, lbn;
   1051 	struct vnode *vp;
   1052 	struct inode *ip;
   1053 	struct buf *bp;
   1054 	int i, error;
   1055 
   1056 	KASSERT(directp != NULL);
   1057 	KASSERT(offsetp != NULL);
   1058 
   1059 	*directp = 0;
   1060 	if ((error = lfs_cleanerlock(fs)) != 0)
   1061 		return error;
   1062 	lfs_seglock(fs, 0);
   1063 	for (i = 0; i < len; ++i) {
   1064 		error = VFS_VGET(fs->lfs_ivnode->v_mount, inoa[i],
   1065 		    LK_EXCLUSIVE | LK_NOWAIT, &vp);
   1066 		if (error)
   1067 			goto out;
   1068 
   1069 		ip = VTOI(vp);
   1070 		if ((vp->v_uflag & VU_DIROP) || (ip->i_flags & IN_ADIROP)) {
   1071 			VOP_UNLOCK(vp);
   1072 			vrele(vp);
   1073 			error = EAGAIN;
   1074 			goto out;
   1075 		}
   1076 
   1077 		/* Highest block in this inode */
   1078 		hiblk = lfs_lblkno(fs, ip->i_size + lfs_sb_getbsize(fs) - 1) - 1;
   1079 
   1080 		for (lbn = 0; lbn <= hiblk; ++lbn) {
   1081 			if (scramble && (lbn & 0x01))
   1082 				continue;
   1083 
   1084 			if (lfs_needsflush(fs)) {
   1085 				lfs_segwrite(fs->lfs_ivnode->v_mount, 0);
   1086 			}
   1087 
   1088 			error = bread(vp, lbn, lfs_blksize(fs, ip, lbn), 0, &bp);
   1089 			if (error)
   1090 				break;
   1091 
   1092 			/* bp->b_cflags |= BC_INVAL; */
   1093 			lfs_bwrite_ext(bp, 0);
   1094 			*directp += lfs_btofsb(fs, bp->b_bcount);
   1095 		}
   1096 
   1097 		/* Done with this vnode */
   1098 		VOP_UNLOCK(vp);
   1099 		vrele(vp);
   1100 		if (error)
   1101 			break;
   1102 	}
   1103 out:
   1104 	lfs_segwrite(fs->lfs_ivnode->v_mount, 0);
   1105 	*offsetp += lfs_btofsb(fs, fs->lfs_sp->bytes_written);
   1106 	lfs_segunlock(fs);
   1107 	lfs_cleanerunlock(fs);
   1108 
   1109 	return error;
   1110 }
   1111 
   1112 int
   1113 lfs_cleanctl(struct lfs *fs, struct lfs_autoclean_params *params)
   1114 {
   1115 	long (*cleanfunc)(struct lfs *, int, SEGUSE *);
   1116 
   1117 	fs->lfs_autoclean = *params;
   1118 
   1119 	cleanfunc = NULL;
   1120 	switch (fs->lfs_autoclean.mode) {
   1121 	case LFS_CLEANMODE_NONE:
   1122 		cleanfunc = NULL;
   1123 		break;
   1124 
   1125 	case LFS_CLEANMODE_GREEDY:
   1126 		cleanfunc = segselect_greedy;
   1127 		break;
   1128 
   1129 	case LFS_CLEANMODE_CB:
   1130 		cleanfunc = segselect_cb_time;
   1131 		break;
   1132 
   1133 	default:
   1134 		return EINVAL;
   1135 	}
   1136 
   1137 	mutex_enter(&lfs_lock);
   1138 	while (cleanfunc == NULL &&
   1139 	       fs->lfs_autoclean_status != LFS_AUTOCLEAN_STATUS_OFF) {
   1140 		cv_wait(&fs->lfs_cleanquitcv, &lfs_lock);
   1141 	}
   1142 	if (fs->lfs_clean_selector == NULL && cleanfunc != NULL)
   1143 		if (++lfs_ncleaners == 1) {
   1144 			if (lfs_cleaner_daemon == NULL &&
   1145 			    kthread_create(PRI_BIO, 0, NULL,
   1146 					   lfs_cleanerd, NULL,
   1147 					   &lfs_cleaner_daemon,
   1148 					   "lfs_cleaner") != 0)
   1149 				panic("fork lfs_cleaner");
   1150 		}
   1151 	if (fs->lfs_clean_selector != NULL && cleanfunc == NULL) {
   1152 		if (--lfs_ncleaners == 0) {
   1153 #if 0
   1154 			kthread_join(lfs_cleaner_daemon);
   1155 			lfs_cleaner_daemon = NULL;
   1156 #endif /* 0 */
   1157 		}
   1158 	}
   1159 	fs->lfs_clean_selector = cleanfunc;
   1160 	mutex_exit(&lfs_lock);
   1161 
   1162 	return 0;
   1163 }
   1164