/*	$NetBSD: lfs_kclean.c,v 1.1 2025/11/06 15:54:27 perseant Exp $	*/

/*-
 * Copyright (c) 2025 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_kclean.c,v 1.1 2025/11/06 15:54:27 perseant Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/buf.h>
#include <sys/kthread.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>

static int ino_func_setclean(struct lfs_inofuncarg *);
static int finfo_func_rewrite(struct lfs_finfofuncarg *);
static int finfo_func_setclean(struct lfs_finfofuncarg *);
static int rewrite_block(struct lfs *, struct vnode *, daddr_t, daddr_t,
    size_t, int *);

static int clean(struct lfs *);
static long segselect_cb_rosenblum(struct lfs *, int, SEGUSE *, long);
static long segselect_greedy(struct lfs *, int, SEGUSE *);
static long segselect_cb_time(struct lfs *, int, SEGUSE *);
#if 0
static long segselect_cb_serial(struct lfs *, int, SEGUSE *);
#endif

struct lwp * lfs_cleaner_daemon = NULL;
extern kcondvar_t lfs_allclean_wakeup;
static int lfs_ncleaners = 0;

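/*
 * lfs_parse_pseg callback for inode blocks: for each inode in the block
 * whose current on-disk address (per the ifile, or the superblock for the
 * ifile itself) is still this block, take a cleaning reference on the
 * corresponding vnode with lfs_setclean().  The block is also copied into
 * lifa->buf for the caller.
 */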
static int
ino_func_setclean(struct lfs_inofuncarg *lifa)
{
	struct lfs *fs;
	daddr_t offset;
	struct vnode *devvp, *vp;
	union lfs_dinode *dip;
	struct buf *dbp, *ibp;
	int error;
	IFILE *ifp;
	unsigned i, num;
	daddr_t true_addr;
	ino_t ino;

	fs = lifa->fs;
	offset = lifa->offset;
	devvp = VTOI(fs->lfs_ivnode)->i_devvp;

	/* Read inode block */
	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
	    0, &dbp);
	if (error) {
		DLOG((DLOG_RF, "ino_func_setclean: bread returned %d\n",
		    error));
		return error;
	}
	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
	brelse(dbp, BC_AGE);

	/* Check each inode against ifile entry */
	num = LFS_INOPB(fs);
	for (i = num; i-- > 0; ) {
		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
		ino = lfs_dino_getinumber(fs, dip);
		if (ino == LFS_IFILE_INUM) {
			/* Check address against superblock */
			true_addr = lfs_sb_getidaddr(fs);
		} else {
			/* Not ifile. Check address against ifile. */
			LFS_IENTRY(ifp, fs, ino, ibp);
			true_addr = lfs_if_getdaddr(fs, ifp);
			brelse(ibp, 0);
		}
		if (offset != true_addr)
			continue;

		LFS_ASSERT_MAXINO(fs, ino);

		/* XXX We can use fastvget here! */

		/*
		 * An inode we need to relocate.
		 * Get it if we can.
		 */
		if (ino == LFS_IFILE_INUM)
			vp = fs->lfs_ivnode;
		else
			error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
			    LK_EXCLUSIVE | LK_NOWAIT, &vp);
		if (error)
			continue;

		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));
		lfs_setclean(fs, vp);
		if (vp != fs->lfs_ivnode) {
			VOP_UNLOCK(vp);
			vrele(vp);
		}
	}

	return error;
}

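/*
 * lfs_parse_pseg callback for inode blocks: for each live inode in the
 * block (its ifile address still points here), fetch the vnode and, if it
 * has not already been marked IN_CLEANING, take a cleaning reference and
 * rewrite the inode into the current partial segment.  The ifile's own
 * inode is skipped here; it is handled by the ifile-aware callers.
 */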
static int
ino_func_rewrite(struct lfs_inofuncarg *lifa)
{
	struct lfs *fs;
	daddr_t offset;
	struct vnode *devvp, *vp;
	union lfs_dinode *dip;
	struct buf *dbp, *ibp;
	int error;
	IFILE *ifp;
	unsigned i, num;
	daddr_t true_addr;
	ino_t ino;

	fs = lifa->fs;
	offset = lifa->offset;
	devvp = VTOI(fs->lfs_ivnode)->i_devvp;

	/* Read inode block */
	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
	    0, &dbp);
	if (error) {
		DLOG((DLOG_RF, "ino_func_rewrite: bread returned %d\n",
		    error));
		return error;
	}
	memcpy(lifa->buf, dbp->b_data, dbp->b_bcount);
	brelse(dbp, BC_AGE);

	/* Check each inode against ifile entry */
	num = LFS_INOPB(fs);
	for (i = num; i-- > 0; ) {
		dip = DINO_IN_BLOCK(fs, lifa->buf, i);
		ino = lfs_dino_getinumber(fs, dip);
		if (ino == LFS_IFILE_INUM) {
			/* Check address against superblock */
			true_addr = lfs_sb_getidaddr(fs);
		} else {
			/* Not ifile. Check address against ifile. */
			LFS_IENTRY(ifp, fs, ino, ibp);
			true_addr = lfs_if_getdaddr(fs, ifp);
			brelse(ibp, 0);
		}
		if (offset != true_addr)
			continue;

		if (ino == LFS_IFILE_INUM)
			continue;

		LFS_ASSERT_MAXINO(fs, ino);

		/* XXX We can use fastvget here! */

		/*
		 * An inode we need to relocate.
		 * Get it if we can.
		 */
		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
		    LK_EXCLUSIVE | LK_NOWAIT, &vp);
		if (error)
			continue;

		KASSERT(VTOI(vp)->i_gen == lfs_dino_getgen(fs, dip));

		if (!(VTOI(vp)->i_state & IN_CLEANING)) {
			lfs_setclean(fs, vp);
			lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
		}

		VOP_UNLOCK(vp);
		vrele(vp);

	}

	return error;
}

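/*
 * Read one block of a file and queue it for rewriting into the current
 * partial segment.  If "offset" is nonzero and the block no longer resides
 * there, return ESTALE.  For regular files a FINFO is acquired on first
 * use and *have_finfop is set; ifile blocks are simply rewritten with
 * VOP_BWRITE.
 */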
static int
rewrite_block(struct lfs *fs, struct vnode *vp, daddr_t lbn, daddr_t offset, size_t size, int *have_finfop)
{
	daddr_t daddr;
	int error;
	struct buf *bp;
	struct inode *ip;

	KASSERT(have_finfop != NULL);

	/* Look up current location of this block. */
	error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
	if (error)
		return error;

	/* Skip any block that is not here. */
	if (offset != 0 && LFS_DBTOFSB(fs, daddr) != offset)
		return ESTALE;

	/*
	 * It is (was recently) here. Read the block.
	 */
	//size = lfs_blksize(fs, VTOI(vp), lbn);
	error = bread(vp, lbn, size, 0, &bp);
	if (error)
		return error;

	if (vp == fs->lfs_ivnode) {
		VOP_BWRITE(vp, bp);
	} else {
		/* Get ready to write. */
		if (!*have_finfop) {
			ip = VTOI(vp);
			lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
			fs->lfs_sp->vp = vp;
			*have_finfop = 1;
		}

		KASSERT(bp->b_vp == vp);
		/* bp->b_cflags |= BC_INVAL; */ /* brelse will kill the buffer */
		lfs_bwrite_ext(bp, BW_CLEAN);
		KASSERT(bp->b_vp == vp);
		mutex_enter(&bufcache_lock);
		while (lfs_gatherblock(fs->lfs_sp, bp, &bufcache_lock)) {
			KASSERT(bp->b_vp != NULL);
		}
		mutex_exit(&bufcache_lock);

		KASSERT(bp->b_flags & B_GATHERED);
		KASSERT(fs->lfs_sp->cbpp[-1] == bp);
	}
	return 0;
}

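/*
 * lfs_parse_pseg callback for FINFO entries: if the file is still live
 * (vnode available, version matches, not involved in a dirop), take a
 * cleaning reference and rewrite each of its blocks that still resides in
 * this segment.  *offsetp is always advanced past the blocks described by
 * the FINFO; if fragsp is non-NULL it accumulates the number of frags
 * actually rewritten.
 */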
static int
finfo_func_rewrite(struct lfs_finfofuncarg *lffa)
{
	struct lfs *fs;
	FINFO *fip;
	daddr_t *offsetp;
	int j, have_finfo, error;
	size_t size, bytes;
	ino_t ino;
	uint32_t gen;
	struct vnode *vp;
	daddr_t lbn;
	int *fragsp;

	fs = lffa->fs;
	fip = lffa->finfop;
	offsetp = lffa->offsetp;
	fragsp = (int *)lffa->arg;

	/* Get the inode and check its version. */
	ino = lfs_fi_getino(fs, fip);
	gen = lfs_fi_getversion(fs, fip);
	error = 0;
	if (ino == LFS_IFILE_INUM)
		vp = fs->lfs_ivnode;
	else {
		LFS_ASSERT_MAXINO(fs, ino);
		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
		    LK_EXCLUSIVE|LK_NOWAIT, &vp);
	}

	/*
	 * If we can't get the vnode, or its version is wrong, or it has
	 * dirop blocks on it, we can't relocate its blocks; but we still
	 * have to count blocks through the partial segment to return the
	 * right offset.
	 * XXX actually we can move DIROP vnodes' *old* data, as long
	 * XXX as we are sure that we are moving *only* the old data---?
	 */
	if (error || VTOI(vp)->i_gen != gen || (vp->v_uflag & VU_DIROP)) {
		if (error == 0)
			error = ESTALE;

		if (vp != NULL && vp != fs->lfs_ivnode) {
			VOP_UNLOCK(vp);
			vrele(vp);
		}
		vp = NULL;
		bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
		    + lfs_fi_getlastlength(fs, fip);
		*offsetp += lfs_btofsb(fs, bytes);

		return error;
	}

	/*
	 * We have the vnode and its version is correct.
	 * Take a cleaning reference, then loop through the blocks
	 * and rewrite them.
	 */
	lfs_setclean(fs, vp);
	size = lfs_sb_getbsize(fs);
	have_finfo = 0;
	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
		if (j == lfs_fi_getnblocks(fs, fip) - 1)
			size = lfs_fi_getlastlength(fs, fip);
		/*
		 * An error of ESTALE indicates that there was nothing
		 * to rewrite; this is not a problem. Any other error
		 * causes us to skip the rest of this FINFO.
		 */
		if (vp != NULL && error == 0) {
			lbn = lfs_fi_getblock(fs, fip, j);
			error = rewrite_block(fs, vp, lbn, *offsetp,
			    size, &have_finfo);
			if (error == ESTALE)
				error = 0;
			if (fragsp != NULL && error == 0)
				*fragsp += lfs_btofsb(fs, size);
		}
		*offsetp += lfs_btofsb(fs, size);
	}

	/*
	 * If we acquired a FINFO, update the metadata for the gathered
	 * blocks, release the FINFO, and write the inode.
	 */
	if (have_finfo) {
		lfs_updatemeta(fs->lfs_sp);
		fs->lfs_sp->vp = NULL;
		lfs_release_finfo(fs);
		lfs_writeinode(fs, fs->lfs_sp, VTOI(vp));
	}

	/* Release vnode */
	if (vp != fs->lfs_ivnode) {
		VOP_UNLOCK(vp);
		vrele(vp);
	}

	return error;
}

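/*
 * lfs_parse_pseg callback for FINFO entries: take a cleaning reference on
 * the file's vnode if it is still live, then advance *offsetp past the
 * blocks described by this FINFO without rewriting anything.
 */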
static int
finfo_func_setclean(struct lfs_finfofuncarg *lffa)
{
	struct lfs *fs;
	FINFO *fip;
	daddr_t *offsetp;
	int error;
	size_t bytes;
	ino_t ino;
	uint32_t gen;
	struct vnode *vp;

	fs = lffa->fs;
	fip = lffa->finfop;
	offsetp = lffa->offsetp;

	/* Get the inode and check its version. */
	ino = lfs_fi_getino(fs, fip);
	gen = lfs_fi_getversion(fs, fip);
	error = 0;
	if (ino == LFS_IFILE_INUM)
		vp = fs->lfs_ivnode;
	else {
		LFS_ASSERT_MAXINO(fs, ino);
		error = VFS_VGET(fs->lfs_ivnode->v_mount, ino,
		    LK_EXCLUSIVE|LK_NOWAIT, &vp);
	}

	/* If we have it and its version is right, take a cleaning reference */
	if (error == 0 && VTOI(vp)->i_gen == gen)
		lfs_setclean(fs, vp);

	if (vp == fs->lfs_ivnode)
		vp = NULL;
	else if (vp != NULL) {
		VOP_UNLOCK(vp);
		vrele(vp);
		vp = NULL;
	}

	/* Skip to the next block */
	bytes = ((lfs_fi_getnblocks(fs, fip) - 1) << lfs_sb_getbshift(fs))
	    + lfs_fi_getlastlength(fs, fip);
	*offsetp += lfs_btofsb(fs, bytes);

	return error;
}

/*
 * Use the partial-segment parser to rewrite (clean) a segment.
 */
int
lfs_rewrite_segment(struct lfs *fs, int sn, int *fragsp, kauth_cred_t cred, struct lwp *l)
{
	daddr_t ooffset, offset, endpseg;

	ASSERT_SEGLOCK(fs);

	offset = lfs_sntod(fs, sn);
	lfs_skip_superblock(fs, &offset);
	endpseg = lfs_sntod(fs, sn + 1);

	while (offset > 0 && offset != endpseg) {
		/* First check summary validity (XXX unnecessary?) */
		ooffset = offset;
		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
		    NULL, NULL, CKSEG_CKSUM, NULL);
		if (offset == ooffset)
			break;

		/*
		 * Valid, proceed.
		 *
		 * First write the file blocks, marking their
		 * inodes IN_CLEANING.
		 */
		offset = ooffset;
		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
		    NULL, finfo_func_rewrite,
		    CKSEG_NONE, fragsp);

		/*
		 * Now go back and pick up any inodes that
		 * were not already marked IN_CLEANING, and
		 * write them as well.
		 */
		offset = ooffset;
		lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
		    ino_func_rewrite, NULL,
		    CKSEG_NONE, fragsp);
	}
	return 0;
}

/*
 * Rewrite the contents of one or more segments, in preparation for
 * marking them clean.
 */
int
lfs_rewrite_segments(struct lfs *fs, int *snn, int len, int *directp, int *offsetp, struct lwp *l)
{
	kauth_cred_t cred;
	int i, error;
	struct buf *bp;
	SEGUSE *sup;
	daddr_t offset, endpseg;

	ASSERT_NO_SEGLOCK(fs);

	cred = l ? l->l_cred : NOCRED;

	/* Prevent new dirops and acquire the cleaner lock. */
	lfs_writer_enter(fs, "rewritesegs");
	if ((error = lfs_cleanerlock(fs)) != 0) {
		lfs_writer_leave(fs);
		return error;
	}

	/*
	 * Pre-reference vnodes now that we have the cleaner lock
	 * but before we take the segment lock. We don't want to
	 * mix cleaning blocks with flushed vnodes.
	 */
	for (i = 0; i < len; i++) {
		error = 0;
		/* Refuse to clean segments that are ACTIVE */
		LFS_SEGENTRY(sup, fs, snn[i], bp);
		if (sup->su_flags & SEGUSE_ACTIVE
		    || !(sup->su_flags & SEGUSE_DIRTY))
			error = EINVAL;

		brelse(bp, 0);
		if (error)
			break;

		offset = lfs_sntod(fs, snn[i]);
		lfs_skip_superblock(fs, &offset);
		endpseg = lfs_sntod(fs, snn[i] + 1);

		while (offset > 0 && offset != endpseg) {
			lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
			    ino_func_setclean, finfo_func_setclean,
			    CKSEG_NONE, NULL);
		}
	}

	/*
	 * Actually rewrite the contents of the segments.
	 */
	lfs_seglock(fs, SEGM_CLEAN);

	for (i = 0; i < len; i++) {
		error = 0;
		/* Refuse to clean segments that are ACTIVE */
		LFS_SEGENTRY(sup, fs, snn[i], bp);
		if (sup->su_flags & SEGUSE_ACTIVE
		    || !(sup->su_flags & SEGUSE_DIRTY))
			error = EINVAL;

		brelse(bp, 0);
		if (error)
			break;

		error = lfs_rewrite_segment(fs, snn[i], directp, cred, l);
		if (error) {
			printf(" rewrite_segment returned %d\n", error);
			break;
		}
	}
	while (lfs_writeseg(fs, fs->lfs_sp))
		;

	*offsetp = lfs_btofsb(fs, fs->lfs_sp->bytes_written);
	lfs_segunlock(fs);
	lfs_cleanerunlock(fs);
	lfs_writer_leave(fs);

	return error;
}

#if 0
static bool
lfs_isseq(const struct lfs *fs, long int lbn1, long int lbn2)
{
	return lbn2 == lbn1 + lfs_sb_getfrag(__UNCONST(fs));
}

/*
 * Rewrite the contents of a file in order to coalesce it.
 * We don't bother rewriting indirect blocks because they will have to
 * be rewritten anyway when we rewrite the direct blocks.
 */
int
lfs_rewrite_file(struct lfs *fs, ino_t ino, struct lwp *l)
{
	daddr_t lbn, hiblk, daddr;
	int i, error, num, run;
	struct vnode *vp;
	struct indir indirs[ULFS_NIADDR+2];
	size_t size;

	ASSERT_SEGLOCK(fs);

	LFS_ASSERT_MAXINO(fs, ino);

	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
	if (error)
		return error;

	lfs_acquire_finfo(fs, ino, VTOI(vp)->i_gen);
	for (lbn = 0, hiblk = VTOI(vp)->i_lfs_hiblk; lbn < hiblk; ++lbn) {
		error = ulfs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, &run,
		    lfs_isseq);
		if (daddr == UNASSIGNED)
			continue;
		for (i = 0; i <= run; i++) {
			size = lfs_blksize(fs, VTOI(vp), lbn);
			error = rewrite_block(fs, vp, lbn++, 0x0, size, NULL);
			if (error)
				break;
		}
	}
	lfs_release_finfo(fs);
	while (lfs_writeseg(fs, fs->lfs_sp))
		;
	lfs_segunlock(fs);

	return error;
}
#endif /* 0 */

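/*
 * lfs_parse_pseg callback for inode blocks: return EEXIST if any inode in
 * the block is still live at this address, i.e. the segment is not empty.
 */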
static int
ino_func_checkempty(struct lfs_inofuncarg *lifa)
{
	struct lfs *fs;
	daddr_t offset;
	struct vnode *devvp;
	union lfs_dinode *dip;
	struct buf *dbp, *ibp;
	int error;
	IFILE *ifp;
	unsigned i, num;
	daddr_t true_addr;
	ino_t ino;

	fs = lifa->fs;
	offset = lifa->offset;
	devvp = VTOI(fs->lfs_ivnode)->i_devvp;

	/* Read inode block */
	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
	    0, &dbp);
	if (error) {
		DLOG((DLOG_RF, "ino_func_checkempty: bread returned %d\n",
		    error));
		return error;
	}

	/* Check each inode against ifile entry */
	num = LFS_INOPB(fs);
	for (i = num; i-- > 0; ) {
		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
		ino = lfs_dino_getinumber(fs, dip);
		if (ino == LFS_IFILE_INUM) {
			/* Check address against superblock */
			true_addr = lfs_sb_getidaddr(fs);
		} else {
			/* Not ifile. Check address against ifile. */
			LFS_IENTRY(ifp, fs, ino, ibp);
			true_addr = lfs_if_getdaddr(fs, ifp);
			brelse(ibp, 0);
		}
		if (offset == true_addr) {
			error = EEXIST;
			break;
		}
	}
	brelse(dbp, BC_AGE);

	return error;
}

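/*
 * lfs_parse_pseg callback for FINFO entries: return EEXIST if any of the
 * blocks described by this FINFO still resides at its old address, i.e.
 * the segment is not empty.  *offsetp is advanced as the blocks are
 * examined.
 */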
static int
finfo_func_checkempty(struct lfs_finfofuncarg *lffa)
{
	struct lfs *fs;
	FINFO *fip;
	daddr_t *offsetp;
	int j, error;
	size_t size, bytes;
	ino_t ino;
	uint32_t gen;
	struct vnode *vp;
	daddr_t lbn, daddr;

	fs = lffa->fs;
	fip = lffa->finfop;
	offsetp = lffa->offsetp;

	/* Get the inode and check its version. */
	ino = lfs_fi_getino(fs, fip);
	gen = lfs_fi_getversion(fs, fip);
	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);

	/*
	 * If we can't get the vnode, or its version is wrong, this FINFO
	 * does not refer to a live file. Skip over it and continue.
	 */
	if (error || VTOI(vp)->i_gen != gen) {
		if (error == 0)
			error = ESTALE;

		if (vp != NULL) {
			VOP_UNLOCK(vp);
			vrele(vp);
			vp = NULL;
		}
		bytes = ((lfs_fi_getnblocks(fs, fip) - 1)
		    << lfs_sb_getbshift(fs))
		    + lfs_fi_getlastlength(fs, fip);
		*offsetp += lfs_btofsb(fs, bytes);

		return error;
	}

	/*
	 * We have the vnode and its version is correct.
	 * Loop through the blocks and check their currency.
	 */
	size = lfs_sb_getbsize(fs);
	for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
		if (j == lfs_fi_getnblocks(fs, fip) - 1)
			size = lfs_fi_getlastlength(fs, fip);
		if (vp != NULL) {
			lbn = lfs_fi_getblock(fs, fip, j);

			/* Look up current location of this block. */
			error = VOP_BMAP(vp, lbn, NULL, &daddr, NULL);
			if (error)
				break;

			/* If it is here, the segment is not empty. */
			if (LFS_DBTOFSB(fs, daddr) == *offsetp) {
				error = EEXIST;
				break;
			}
		}
		*offsetp += lfs_btofsb(fs, size);
	}

	/* Release vnode */
	VOP_UNLOCK(vp);
	vrele(vp);

	return error;
}

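/*
 * Scan the partial segments of segment "sn" with the checkempty callbacks
 * above to look for live data.  Returns 0 if the segment appears to
 * contain no live data, otherwise the error returned by the parser
 * (e.g. EEXIST).
 */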
int
lfs_checkempty(struct lfs *fs, int sn, kauth_cred_t cred, struct lwp *l)
{
	daddr_t offset, endpseg;
	int error;

	ASSERT_SEGLOCK(fs);

	offset = lfs_sntod(fs, sn);
	lfs_skip_superblock(fs, &offset);
	endpseg = lfs_sntod(fs, sn + 1);

	while (offset > 0 && offset < endpseg) {
		error = lfs_parse_pseg(fs, &offset, 0, cred, NULL, l,
		    ino_func_checkempty,
		    finfo_func_checkempty,
		    CKSEG_NONE, NULL);
		if (error)
			return error;
	}
	return 0;
}

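/*
 * Greedy segment selection: the cleaning priority is simply the amount of
 * space in the segment not occupied by live data.
 */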
static long
segselect_greedy(struct lfs *fs, int sn, SEGUSE *sup)
{
	return lfs_sb_getssize(fs) - sup->su_nbytes;
}

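/*
 * Cost-benefit priority in the style of Rosenblum and Ousterhout: weigh
 * the space we would recover (less summary, superblock and fragmentation
 * overhead) against the cost of reading and rewriting the segment, scaled
 * by the segment's age.
 */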
__inline static long
segselect_cb_rosenblum(struct lfs *fs, int sn, SEGUSE *sup, long age)
{
	long benefit, cost;

	benefit = (int64_t)lfs_sb_getssize(fs) - sup->su_nbytes -
	    (sup->su_nsums + 1) * lfs_sb_getfsize(fs);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		benefit -= LFS_SBPAD;
	if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs))	/* fragmentation */
		benefit -= (lfs_sb_getbsize(fs) / 2);
	if (benefit <= 0) {
		return 0;
	}

	cost = lfs_sb_getssize(fs) + sup->su_nbytes;
	return (256 * benefit * age) / cost;
}

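/*
 * Cost-benefit segment selection using wall-clock time since the segment
 * was last modified as its age.
 */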
static long
segselect_cb_time(struct lfs *fs, int sn, SEGUSE *sup)
{
	long age;

	age = time_second - sup->su_lastmod;
	if (age < 0)
		age = 0;
	return segselect_cb_rosenblum(fs, sn, sup, age);
}

#if 0
/*
 * Same as the time comparator, but fetch the serial number from the
 * segment header to compare.
 *
 * This is ugly. Whether serial number or wall time is better is a
 * worthy question, but if we want to use serial number to compute
 * age, we should record the serial number in su_lastmod instead of
 * the time.
 */
static long
segselect_cb_serial(struct lfs *fs, int sn, SEGUSE *sup)
{
	struct buf *bp;
	uint32_t magic;
	uint64_t age, serial;
	daddr_t addr;

	addr = lfs_segtod(fs, sn);
	lfs_skip_superblock(fs, &addr);
	bread(fs->lfs_devvp, LFS_FSBTODB(fs, addr),
	    lfs_sb_getsumsize(fs), 0, &bp);
	magic = lfs_ss_getmagic(fs, ((SEGSUM *)bp->b_data));
	serial = lfs_ss_getserial(fs, ((SEGSUM *)bp->b_data));
	brelse(bp, 0);

	if (magic != SS_MAGIC)
		return 0;

	age = lfs_sb_getserial(fs) - serial;
	return segselect_cb_rosenblum(fs, sn, sup, age);
}
#endif

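/*
 * Kernel cleaner daemon.  Periodically walk the list of mounted LFSs and
 * clean each one that has a cleaning selector configured; exit when no
 * file system wants kernel cleaning any more.
 */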
void
lfs_cleanerd(void *arg)
{
	mount_iterator_t *iter;
	struct mount *mp;
	struct lfs *fs;
	struct vfsops *vfs = NULL;
	int lfsc;
	int cleaned_something = 0;

	mutex_enter(&lfs_lock);
	KASSERTMSG(lfs_cleaner_daemon == NULL,
	    "more than one LFS cleaner daemon");
	lfs_cleaner_daemon = curlwp;
	mutex_exit(&lfs_lock);

	/* Take an extra reference to the LFS vfsops. */
	vfs = vfs_getopsbyname(MOUNT_LFS);

	mutex_enter(&lfs_lock);
	for (;;) {
		KASSERT(mutex_owned(&lfs_lock));
		if (cleaned_something == 0)
			cv_timedwait(&lfs_allclean_wakeup, &lfs_lock, hz/10 + 1);
		KASSERT(mutex_owned(&lfs_lock));
		cleaned_something = 0;

		KASSERT(mutex_owned(&lfs_lock));
		mutex_exit(&lfs_lock);

		/*
		 * Look through the list of LFSs to see if any of them
		 * need cleaning.
		 */
		mountlist_iterator_init(&iter);
		lfsc = 0;
		while ((mp = mountlist_iterator_next(iter)) != NULL) {
			KASSERT(!mutex_owned(&lfs_lock));
			if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
				fs = VFSTOULFS(mp)->um_lfs;

				mutex_enter(&lfs_lock);
				if (fs->lfs_clean_selector != NULL)
					++lfsc;
				mutex_exit(&lfs_lock);
				cleaned_something += clean(fs);
			}
		}
		if (lfsc == 0) {
			mutex_enter(&lfs_lock);
			lfs_cleaner_daemon = NULL;
			mutex_exit(&lfs_lock);
			mountlist_iterator_destroy(iter);
			break;
		}
		mountlist_iterator_destroy(iter);

		mutex_enter(&lfs_lock);
	}
	KASSERT(!mutex_owned(&lfs_lock));

	/* Give up our extra reference so the module can be unloaded. */
	mutex_enter(&vfs_list_lock);
	if (vfs != NULL)
		vfs->vfs_refcount--;
	mutex_exit(&vfs_list_lock);

	/* Done! */
	kthread_exit(0);
}

/*
 * Look at the file system to see whether it needs cleaning, and if it does,
 * clean a segment.
 */
static int
clean(struct lfs *fs)
{
	struct buf *bp;
	SEGUSE *sup;
	int sn, maxsn, nclean, nready, nempty, nerror, nzero, again, target;
	long prio, maxprio, maxeprio, thresh;
	long (*func)(struct lfs *, int, SEGUSE *);
	uint32_t __debugused segflags = 0;
	daddr_t oldsn, bfree, avail;
	int direct, offset;

	func = fs->lfs_clean_selector;
	if (func == NULL)
		return 0;

	thresh = fs->lfs_autoclean.thresh;
	if (fs->lfs_flags & LFS_MUSTCLEAN)
		thresh = 0;
	else if (thresh < 0) {
		/*
		 * Compute a priority threshold based on availability ratio.
		 * XXX These numbers only make sense for the greedy cleaner.
		 * What is an appropriate threshold for the cost-benefit
		 * cleaner?
		 */
		bfree = lfs_sb_getbfree(fs)
		    + lfs_segtod(fs, 1) * lfs_sb_getminfree(fs);
		avail = lfs_sb_getavail(fs) - fs->lfs_ravail - fs->lfs_favail;
		if (avail > bfree)
			return 0;
		thresh = lfs_sb_getssize(fs) * (bfree - avail)
		    / (lfs_sb_getsize(fs) - avail);
		if (thresh > lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs))
			thresh = lfs_sb_getsumsize(fs) + 5 * lfs_sb_getbsize(fs);
		if (thresh > lfs_sb_getssize(fs) - lfs_sb_getbsize(fs))
			return 0;
	}

	target = fs->lfs_autoclean.target;
	if (target <= 0) {
		/* Default to a target of half a segment */
		target = lfs_segtod(fs, 1) / 2;
	}

	oldsn = lfs_dtosn(fs, lfs_sb_getoffset(fs));

	again = 0;
	maxprio = maxeprio = -1;
	nzero = nclean = nready = nempty = nerror = 0;
	for (sn = 0; sn < lfs_sb_getnseg(fs); sn++) {

		prio = 0;
		LFS_SEGENTRY(sup, fs, sn, bp);
		if (sup->su_flags & SEGUSE_ACTIVE)
			prio = 0;
		else if (!(sup->su_flags & SEGUSE_DIRTY))
			++nclean;
		else if (sup->su_flags & SEGUSE_READY)
			++nready;
		else if (sup->su_flags & SEGUSE_EMPTY)
			++nempty;
		else if (sup->su_nbytes == 0)
			++nzero;
		else
			prio = (*func)(fs, sn, sup);

		if (sup->su_flags & SEGUSE_ERROR) {
			if (prio > maxeprio)
				maxeprio = prio;
			prio = 0;
			++nerror;
		}

		if (prio > maxprio) {
			maxprio = prio;
			maxsn = sn;
			segflags = sup->su_flags;
		}
		brelse(bp, 0);
	}
	DLOG((DLOG_CLEAN, "%s clean=%d/%d zero=%d empty=%d ready=%d maxsn=%d maxprio=%ld/%ld segflags=0x%lx\n",
	    (maxprio > thresh ? "YES" : "NO "),
	    nclean, (int)lfs_sb_getnseg(fs), nzero, nempty, nready,
	    maxsn, maxprio, (unsigned long)thresh,
	    (unsigned long)segflags));

	/*
	 * If we are trying to clean the segment we cleaned last,
	 * cleaning did not work. Mark this segment SEGUSE_ERROR
	 * and try again.
	 */
	if (maxprio > 0 && fs->lfs_lastcleaned == maxsn) {
		LFS_SEGENTRY(sup, fs, maxsn, bp);
		sup->su_flags |= SEGUSE_ERROR;
		LFS_WRITESEGENTRY(sup, fs, maxsn, bp);
		return 1;
	}

	/*
	 * If there was nothing but error segments, clear the error flags.
	 * We will wait to try again.
	 */
	if (maxprio == 0 && maxeprio > 0) {
		DLOG((DLOG_CLEAN, "clear error on %d segments, try again\n",
		    nerror));
		lfs_seguse_clrflag_all(fs, SEGUSE_ERROR);
	}

	/* Rewrite the highest-priority segment */
	if (maxprio > thresh) {
		direct = offset = 0;
		(void)lfs_rewrite_segments(fs, &maxsn, 1,
		    &direct, &offset, curlwp);
		DLOG((DLOG_CLEAN, " direct=%d offset=%d\n", direct, offset));
		again += direct;
		fs->lfs_clean_accum += offset;

		/* Don't clean this again immediately */
		fs->lfs_lastcleaned = maxsn;
	}

	/*
	 * If we are in dire straits but we have segments already
	 * empty, force a double checkpoint to reclaim them.
	 */
	if (fs->lfs_flags & LFS_MUSTCLEAN) {
		if (nready + nempty > 0) {
			printf("force checkpoint with nready=%d nempty=%d nzero=%d\n",
			    nready, nempty, nzero);
			lfs_segwrite(fs->lfs_ivnode->v_mount,
			    SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC);
			lfs_segwrite(fs->lfs_ivnode->v_mount,
			    SEGM_CKP | SEGM_FORCE_CKP | SEGM_SYNC);
			++again;
		}
	} else if (fs->lfs_clean_accum > target) {
		DLOG((DLOG_CLEAN, "checkpoint to flush\n"));
		lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP);
		fs->lfs_clean_accum = 0;
	} else if (lfs_dtosn(fs, lfs_sb_getoffset(fs)) != oldsn
	    || nempty + nready > LFS_MAX_ACTIVE) {	/* XXX arbitrary */
		DLOG((DLOG_CLEAN, "write to promote empty segments\n"));
		lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP);
		fs->lfs_clean_accum = 0;
	}

	return again;
}

/*
 * Rewrite a file in its entirety.
 *
 * Generally this would be done to coalesce a file that is scattered
 * around the disk; but if the "scramble" flag is set, instead rewrite
 * only the even-numbered blocks, which provides the opposite effect
 * for testing purposes.
 *
 * It is the caller's responsibility to check the bounds of the inode
 * numbers.
 */
int
lfs_rewrite_file(struct lfs *fs, ino_t *inoa, int len, bool scramble,
    int *directp, int *offsetp)
{
	daddr_t hiblk, lbn;
	struct vnode *vp;
	struct inode *ip;
	struct buf *bp;
	int i, error, flags;

	*directp = 0;
	if ((error = lfs_cleanerlock(fs)) != 0)
		return error;
	flags = SEGM_PROT;
	lfs_seglock(fs, flags);
	for (i = 0; i < len; ++i) {
		error = VFS_VGET(fs->lfs_ivnode->v_mount, inoa[i], LK_EXCLUSIVE, &vp);
		if (error)
			goto out;

		ip = VTOI(vp);
		if ((vp->v_uflag & VU_DIROP) || (ip->i_flags & IN_ADIROP)) {
			VOP_UNLOCK(vp);
			vrele(vp);
			error = EAGAIN;
			goto out;
		}

		/* Highest block in this inode */
		hiblk = lfs_lblkno(fs, ip->i_size + lfs_sb_getbsize(fs) - 1) - 1;

		for (lbn = 0; lbn <= hiblk; ++lbn) {
			if (scramble && (lbn & 0x01))
				continue;

			if (lfs_needsflush(fs)) {
				lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
			}

			error = bread(vp, lbn, lfs_blksize(fs, ip, lbn), 0, &bp);
			if (error)
				break;

			/* bp->b_cflags |= BC_INVAL; */
			lfs_bwrite_ext(bp, (flags & SEGM_CLEAN ? BW_CLEAN : 0));
			*directp += lfs_btofsb(fs, bp->b_bcount);
		}

		/* Done with this vnode */
		VOP_UNLOCK(vp);
		vrele(vp);
		if (error)
			break;
	}
out:
	lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
	*offsetp += lfs_btofsb(fs, fs->lfs_sp->bytes_written);
	lfs_segunlock(fs);
	lfs_cleanerunlock(fs);

	return error;
}

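/*
 * Set the automatic cleaning parameters for this file system, choosing the
 * segment selection function from the requested mode, and start or stop
 * the kernel cleaner thread as needed.
 */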
int
lfs_cleanctl(struct lfs *fs, struct lfs_autoclean_params *params)
{
	long (*cleanfunc)(struct lfs *, int, SEGUSE *);

	fs->lfs_autoclean = *params;

	cleanfunc = NULL;
	switch (fs->lfs_autoclean.mode) {
	case LFS_CLEANMODE_NONE:
		cleanfunc = NULL;
		break;

	case LFS_CLEANMODE_GREEDY:
		cleanfunc = segselect_greedy;
		break;

	case LFS_CLEANMODE_CB:
		cleanfunc = segselect_cb_time;
		break;

	default:
		return EINVAL;
	}

	mutex_enter(&lfs_lock);
	if (fs->lfs_clean_selector == NULL && cleanfunc != NULL)
		if (++lfs_ncleaners == 1) {
			printf("Starting cleaner thread\n");
			if (lfs_cleaner_daemon == NULL &&
			    kthread_create(PRI_BIO, 0, NULL,
			    lfs_cleanerd, NULL, NULL,
			    "lfs_cleaner") != 0)
				panic("fork lfs_cleaner");
		}
	if (fs->lfs_clean_selector != NULL && cleanfunc == NULL)
		if (--lfs_ncleaners == 0) {
			printf("Stopping cleaner thread\n");
			kthread_join(lfs_cleaner_daemon);
		}
	fs->lfs_clean_selector = cleanfunc;
	mutex_exit(&lfs_lock);

	return 0;
}