lfs_cleanerd.c revision 1.59 1 /* $NetBSD: lfs_cleanerd.c,v 1.59 2019/08/22 20:28:08 brad Exp $ */
2
3 /*-
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * The cleaner daemon for the NetBSD Log-structured File System.
34 * Only tested for use with version 2 LFSs.
35 */
36
37 #include <sys/syslog.h>
38 #include <sys/param.h>
39 #include <sys/mount.h>
40 #include <sys/stat.h>
41 #include <ufs/lfs/lfs.h>
42
43 #include <assert.h>
44 #include <err.h>
45 #include <errno.h>
46 #include <fcntl.h>
47 #include <semaphore.h>
48 #include <stdbool.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <time.h>
54 #include <util.h>
55
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "lfs_user.h"
59 #include "fdfs.h"
60 #include "cleaner.h"
61 #include "kernelops.h"
62 #include "mount_lfs.h"
63
64 /*
65 * Global variables.
66 */
67 /* XXX these top few should really be fs-specific */
68 int use_fs_idle; /* Use fs idle rather than cpu idle time */
69 int use_bytes; /* Use bytes written rather than segments cleaned */
70 double load_threshold; /* How idle is idle (CPU idle) */
71 int atatime; /* How many segments (bytes) to clean at a time */
72
73 int nfss; /* Number of filesystems monitored by this cleanerd */
74 struct clfs **fsp; /* Array of extended filesystem structures */
75 int segwait_timeout; /* Time to wait in lfs_segwait() */
76 int do_quit; /* Quit after one cleaning loop */
77 int do_coalesce; /* Coalesce filesystem */
78 int do_small; /* Use small writes through markv */
79 char *copylog_filename; /* File to use for fs debugging analysis */
80 int inval_segment; /* Segment to invalidate */
81 int stat_report; /* Report statistics for this period of cycles */
82 int debug; /* Turn on debugging */
/*
 * Running totals kept by the cleaner for its statistics report.
 */
struct cleaner_stats {
	double util_tot;	/* sum of the per-pass utilization ratios */
	double util_sos;	/* sum of the squares of those ratios */
	off_t bytes_read;	/* grows by one segment size per load_segment() */
	off_t bytes_written;	/* bytes of live blocks passed on for rewriting */
	off_t segs_cleaned;	/* segments loaded by load_segment() */
	off_t segs_empty;	/* NOTE(review): presumably empty segments seen; not updated in the code visible here */
	off_t segs_error;	/* segments whose parse stopped on a parse_pseg() failure */
} cleaner_stats;
92
93 extern u_int32_t cksum(void *, size_t);
94 extern u_int32_t lfs_sb_cksum(struct dlfs *);
95 extern u_int32_t lfs_cksum_part(void *, size_t, u_int32_t);
96 extern int ulfs_getlbns(struct lfs *, struct uvnode *, daddr_t, struct indir *, int *);
97
/* Ugh: the larger of the dlfs_fsmnt field sizes in the 32- and 64-bit superblocks */
99 #define FSMNT_SIZE MAX(sizeof(((struct dlfs *)0)->dlfs_fsmnt), \
100 sizeof(((struct dlfs64 *)0)->dlfs_fsmnt))
101
102
/*
 * Compat: a pwarn() that accepts a string plus any further arguments
 * and ignores all of them.
 */
void
pwarn(const char *unused, ...)
{
	/* Does nothing */
	(void)unused;
}
105
106 /*
107 * Log a message if debugging is turned on.
108 */
109 void
110 dlog(const char *fmt, ...)
111 {
112 va_list ap;
113
114 if (debug == 0)
115 return;
116
117 va_start(ap, fmt);
118 vsyslog(LOG_DEBUG, fmt, ap);
119 va_end(ap);
120 }
121
122 /*
123 * Remove the specified filesystem from the list, due to its having
124 * become unmounted or other error condition.
125 */
126 void
127 handle_error(struct clfs **cfsp, int n)
128 {
129 syslog(LOG_NOTICE, "%s: detaching cleaner", lfs_sb_getfsmnt(cfsp[n]));
130 free(cfsp[n]);
131 if (n != nfss - 1)
132 cfsp[n] = cfsp[nfss - 1];
133 --nfss;
134 }
135
136 /*
137 * Reinitialize a filesystem if, e.g., its size changed.
138 */
139 int
140 reinit_fs(struct clfs *fs)
141 {
142 char fsname[FSMNT_SIZE];
143
144 memcpy(fsname, lfs_sb_getfsmnt(fs), sizeof(fsname));
145 fsname[sizeof(fsname) - 1] = '\0';
146
147 kops.ko_close(fs->clfs_ifilefd);
148 kops.ko_close(fs->clfs_devfd);
149 fd_reclaim(fs->clfs_devvp);
150 fd_reclaim(fs->lfs_ivnode);
151 free(fs->clfs_dev);
152 free(fs->clfs_segtab);
153 free(fs->clfs_segtabp);
154
155 return init_fs(fs, fsname);
156 }
157
158 #ifdef REPAIR_ZERO_FINFO
159 /*
160 * Use fsck's lfs routines to load the Ifile from an unmounted fs.
161 * We interpret "fsname" as the name of the raw disk device.
162 */
163 int
164 init_unmounted_fs(struct clfs *fs, char *fsname)
165 {
166 struct lfs *disc_fs;
167 int i;
168
169 fs->clfs_dev = fsname;
170 if ((fs->clfs_devfd = kops.ko_open(fs->clfs_dev, O_RDWR)) < 0) {
171 syslog(LOG_ERR, "couldn't open device %s read/write",
172 fs->clfs_dev);
173 return -1;
174 }
175
176 disc_fs = lfs_init(fs->clfs_devfd, 0, 0, 0, 0);
177
178 fs->lfs_dlfs = disc_fs->lfs_dlfs; /* Structure copy */
179 strncpy(fs->lfs_fsmnt, fsname, MNAMELEN);
180 fs->lfs_ivnode = (struct uvnode *)disc_fs->lfs_ivnode;
181 fs->clfs_devvp = fd_vget(fs->clfs_devfd, fs->lfs_fsize, fs->lfs_ssize,
182 atatime);
183
184 /* Allocate and clear segtab */
185 fs->clfs_segtab = (struct clfs_seguse *)malloc(lfs_sb_getnseg(fs) *
186 sizeof(*fs->clfs_segtab));
187 fs->clfs_segtabp = (struct clfs_seguse **)malloc(lfs_sb_getnseg(fs) *
188 sizeof(*fs->clfs_segtabp));
189 for (i = 0; i < lfs_sb_getnseg(fs); i++) {
190 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]);
191 fs->clfs_segtab[i].flags = 0x0;
192 }
193 syslog(LOG_NOTICE, "%s: unmounted cleaner starting", fsname);
194
195 return 0;
196 }
197 #endif
198
199 /*
200 * Set up the file descriptors, including the Ifile descriptor.
201 * If we can't get the Ifile, this is not an LFS (or the kernel is
202 * too old to support the fcntl).
203 * XXX Merge this and init_unmounted_fs, switching on whether
204 * XXX "fsname" is a dir or a char special device. Should
205 * XXX also be able to read unmounted devices out of fstab, the way
206 * XXX fsck does.
207 */
208 int
209 init_fs(struct clfs *fs, char *fsname)
210 {
211 char mnttmp[FSMNT_SIZE];
212 struct statvfs sf;
213 int rootfd;
214 int i;
215 void *sbuf;
216 size_t mlen;
217
218 /*
219 * Get the raw device from the block device.
220 * XXX this is ugly. Is there a way to discover the raw device
221 * XXX for a given mount point?
222 */
223 if (kops.ko_statvfs(fsname, &sf, ST_WAIT) < 0)
224 return -1;
225 mlen = strlen(sf.f_mntfromname) + 2;
226 fs->clfs_dev = malloc(mlen);
227 if (fs->clfs_dev == NULL) {
228 syslog(LOG_ERR, "couldn't malloc device name string: %m");
229 return -1;
230 }
231 if (getdiskrawname(fs->clfs_dev, mlen, sf.f_mntfromname) == NULL) {
232 syslog(LOG_ERR, "couldn't convert '%s' ro raw name: %m",
233 sf.f_mntfromname);
234 return -1;
235 }
236 if ((fs->clfs_devfd = kops.ko_open(fs->clfs_dev, O_RDONLY, 0)) < 0) {
237 syslog(LOG_ERR, "couldn't open device %s for reading",
238 fs->clfs_dev);
239 return -1;
240 }
241
242 /* Find the Ifile and open it */
243 if ((rootfd = kops.ko_open(fsname, O_RDONLY, 0)) < 0)
244 return -2;
245 if (kops.ko_fcntl(rootfd, LFCNIFILEFH, &fs->clfs_ifilefh) < 0)
246 return -3;
247 if ((fs->clfs_ifilefd = kops.ko_fhopen(&fs->clfs_ifilefh,
248 sizeof(fs->clfs_ifilefh), O_RDONLY)) < 0)
249 return -4;
250 kops.ko_close(rootfd);
251
252 sbuf = malloc(LFS_SBPAD);
253 if (sbuf == NULL) {
254 syslog(LOG_ERR, "couldn't malloc superblock buffer");
255 return -1;
256 }
257
258 /* Load in the superblock */
259 if (kops.ko_pread(fs->clfs_devfd, sbuf, LFS_SBPAD, LFS_LABELPAD) < 0) {
260 free(sbuf);
261 return -1;
262 }
263
264 __CTASSERT(sizeof(struct dlfs) == sizeof(struct dlfs64));
265 memcpy(&fs->lfs_dlfs_u, sbuf, sizeof(struct dlfs));
266 free(sbuf);
267
268 /* If it is not LFS, complain and exit! */
269 switch (fs->lfs_dlfs_u.u_32.dlfs_magic) {
270 case LFS_MAGIC:
271 fs->lfs_is64 = false;
272 fs->lfs_dobyteswap = false;
273 break;
274 case LFS_MAGIC_SWAPPED:
275 fs->lfs_is64 = false;
276 fs->lfs_dobyteswap = true;
277 break;
278 case LFS64_MAGIC:
279 fs->lfs_is64 = true;
280 fs->lfs_dobyteswap = false;
281 break;
282 case LFS64_MAGIC_SWAPPED:
283 fs->lfs_is64 = true;
284 fs->lfs_dobyteswap = true;
285 break;
286 default:
287 syslog(LOG_ERR, "%s: not LFS", fsname);
288 return -1;
289 }
290 /* XXX: can this ever need to be set? does the cleaner even care? */
291 fs->lfs_hasolddirfmt = 0;
292
293 /* If this is not a version 2 filesystem, complain and exit */
294 if (lfs_sb_getversion(fs) != 2) {
295 syslog(LOG_ERR, "%s: not a version 2 LFS", fsname);
296 return -1;
297 }
298
299 /* Assume fsname is the mounted name */
300 strncpy(mnttmp, fsname, sizeof(mnttmp));
301 mnttmp[sizeof(mnttmp) - 1] = '\0';
302 lfs_sb_setfsmnt(fs, mnttmp);
303
304 /* Set up vnodes for Ifile and raw device */
305 fs->lfs_ivnode = fd_vget(fs->clfs_ifilefd, lfs_sb_getbsize(fs), 0, 0);
306 fs->clfs_devvp = fd_vget(fs->clfs_devfd, lfs_sb_getfsize(fs), lfs_sb_getssize(fs),
307 atatime);
308
309 /* Allocate and clear segtab */
310 fs->clfs_segtab = (struct clfs_seguse *)malloc(lfs_sb_getnseg(fs) *
311 sizeof(*fs->clfs_segtab));
312 fs->clfs_segtabp = (struct clfs_seguse **)malloc(lfs_sb_getnseg(fs) *
313 sizeof(*fs->clfs_segtabp));
314 if (fs->clfs_segtab == NULL || fs->clfs_segtabp == NULL) {
315 syslog(LOG_ERR, "%s: couldn't malloc segment table: %m",
316 fs->clfs_dev);
317 return -1;
318 }
319
320 for (i = 0; i < lfs_sb_getnseg(fs); i++) {
321 fs->clfs_segtabp[i] = &(fs->clfs_segtab[i]);
322 fs->clfs_segtab[i].flags = 0x0;
323 }
324
325 syslog(LOG_NOTICE, "%s: attaching cleaner", fsname);
326 return 0;
327 }
328
329 /*
330 * Invalidate all the currently held Ifile blocks so they will be
331 * reread when we clean. Check the size while we're at it, and
332 * resize the buffer cache if necessary.
333 */
334 void
335 reload_ifile(struct clfs *fs)
336 {
337 struct ubuf *bp;
338 struct stat st;
339 int ohashmax;
340 extern int hashmax;
341
342 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_dirtyblkhd)) != NULL) {
343 bremfree(bp);
344 buf_destroy(bp);
345 }
346 while ((bp = LIST_FIRST(&fs->lfs_ivnode->v_cleanblkhd)) != NULL) {
347 bremfree(bp);
348 buf_destroy(bp);
349 }
350
351 /* If Ifile is larger than buffer cache, rehash */
352 fstat(fs->clfs_ifilefd, &st);
353 if (st.st_size / lfs_sb_getbsize(fs) > hashmax) {
354 ohashmax = hashmax;
355 bufrehash(st.st_size / lfs_sb_getbsize(fs));
356 dlog("%s: resized buffer hash from %d to %d",
357 lfs_sb_getfsmnt(fs), ohashmax, hashmax);
358 }
359 }
360
361 /*
362 * Get IFILE entry for the given inode, store in ifpp. The buffer
363 * which contains that data is returned in bpp, and must be brelse()d
364 * by the caller.
365 *
366 * XXX this is cutpaste of LFS_IENTRY from lfs.h; unify the two.
367 */
368 void
369 lfs_ientry(IFILE **ifpp, struct clfs *fs, ino_t ino, struct ubuf **bpp)
370 {
371 IFILE64 *ifp64;
372 IFILE32 *ifp32;
373 IFILE_V1 *ifp_v1;
374 int error;
375
376 error = bread(fs->lfs_ivnode,
377 ino / lfs_sb_getifpb(fs) + lfs_sb_getcleansz(fs) +
378 lfs_sb_getsegtabsz(fs), lfs_sb_getbsize(fs), 0, bpp);
379 if (error)
380 syslog(LOG_ERR, "%s: ientry failed for ino %d",
381 lfs_sb_getfsmnt(fs), (int)ino);
382 if (fs->lfs_is64) {
383 ifp64 = (IFILE64 *)(*bpp)->b_data;
384 ifp64 += ino % lfs_sb_getifpb(fs);
385 *ifpp = (IFILE *)ifp64;
386 } else if (lfs_sb_getversion(fs) > 1) {
387 ifp32 = (IFILE32 *)(*bpp)->b_data;
388 ifp32 += ino % lfs_sb_getifpb(fs);
389 *ifpp = (IFILE *)ifp32;
390 } else {
391 ifp_v1 = (IFILE_V1 *)(*bpp)->b_data;
392 ifp_v1 += ino % lfs_sb_getifpb(fs);
393 *ifpp = (IFILE *)ifp_v1;
394 }
395 return;
396 }
397
398 #ifdef TEST_PATTERN
399 /*
400 * Check ULFS_ROOTINO for file data. The assumption is that we are running
401 * the "twofiles" test with the rest of the filesystem empty. Files
402 * created by "twofiles" match the test pattern, but ULFS_ROOTINO and the
403 * executable itself (assumed to be inode 3) should not match.
404 */
405 static void
406 check_test_pattern(BLOCK_INFO *bip)
407 {
408 int j;
409 unsigned char *cp = bip->bi_bp;
410
411 /* Check inode sanity */
412 if (bip->bi_lbn == LFS_UNUSED_LBN) {
413 assert(((struct ulfs1_dinode *)bip->bi_bp)->di_inumber ==
414 bip->bi_inode);
415 }
416
417 /* These can have the test pattern and it's all good */
418 if (bip->bi_inode > 3)
419 return;
420
421 for (j = 0; j < bip->bi_size; j++) {
422 if (cp[j] != (j & 0xff))
423 break;
424 }
425 assert(j < bip->bi_size);
426 }
427 #endif /* TEST_PATTERN */
428
429 /*
430 * Parse the partial segment at daddr, adding its information to
431 * bip. Return the address of the next partial segment to read.
432 */
433 static daddr_t
434 parse_pseg(struct clfs *fs, daddr_t daddr, BLOCK_INFO **bipp, int *bic)
435 {
436 SEGSUM *ssp;
437 IFILE *ifp;
438 BLOCK_INFO *bip, *nbip;
439 daddr_t idaddr, odaddr;
440 FINFO *fip;
441 IINFO *iip;
442 struct ubuf *ifbp;
443 union lfs_dinode *dip;
444 u_int32_t ck, vers;
445 int fic, inoc, obic;
446 size_t sumstart;
447 int i;
448 char *cp;
449
450 odaddr = daddr;
451 obic = *bic;
452 bip = *bipp;
453
454 /*
455 * Retrieve the segment header, set up the SEGSUM pointer
456 * as well as the first FINFO and inode address pointer.
457 */
458 cp = fd_ptrget(fs->clfs_devvp, daddr);
459 ssp = (SEGSUM *)cp;
460 iip = SEGSUM_IINFOSTART(fs, cp);
461 fip = SEGSUM_FINFOBASE(fs, cp);
462
463 /*
464 * Check segment header magic and checksum
465 */
466 if (lfs_ss_getmagic(fs, ssp) != SS_MAGIC) {
467 syslog(LOG_WARNING, "%s: sumsum magic number bad at 0x%jx:"
468 " read 0x%x, expected 0x%x", lfs_sb_getfsmnt(fs),
469 (intmax_t)daddr, lfs_ss_getmagic(fs, ssp), SS_MAGIC);
470 return 0x0;
471 }
472 sumstart = lfs_ss_getsumstart(fs);
473 ck = cksum((char *)ssp + sumstart, lfs_sb_getsumsize(fs) - sumstart);
474 if (ck != lfs_ss_getsumsum(fs, ssp)) {
475 syslog(LOG_WARNING, "%s: sumsum checksum mismatch at 0x%jx:"
476 " read 0x%x, computed 0x%x", lfs_sb_getfsmnt(fs),
477 (intmax_t)daddr, lfs_ss_getsumsum(fs, ssp), ck);
478 return 0x0;
479 }
480
481 /* Initialize data sum */
482 ck = 0;
483
484 /* Point daddr at next block after segment summary */
485 ++daddr;
486
487 /*
488 * Loop over file info and inode pointers. We always move daddr
489 * forward here because we are also computing the data checksum
490 * as we go.
491 */
492 fic = inoc = 0;
493 while (fic < lfs_ss_getnfinfo(fs, ssp) || inoc < lfs_ss_getninos(fs, ssp)) {
494 /*
495 * We must have either a file block or an inode block.
496 * If we don't have either one, it's an error.
497 */
498 if (fic >= lfs_ss_getnfinfo(fs, ssp) && lfs_ii_getblock(fs, iip) != daddr) {
499 syslog(LOG_WARNING, "%s: bad pseg at %jx (seg %d)",
500 lfs_sb_getfsmnt(fs), (intmax_t)odaddr, lfs_dtosn(fs, odaddr));
501 *bipp = bip;
502 return 0x0;
503 }
504
505 /*
506 * Note each inode from the inode blocks
507 */
508 if (inoc < lfs_ss_getninos(fs, ssp) && lfs_ii_getblock(fs, iip) == daddr) {
509 cp = fd_ptrget(fs->clfs_devvp, daddr);
510 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck);
511 for (i = 0; i < lfs_sb_getinopb(fs); i++) {
512 dip = DINO_IN_BLOCK(fs, cp, i);
513 if (lfs_dino_getinumber(fs, dip) == 0)
514 break;
515
516 /*
517 * Check currency before adding it
518 */
519 #ifndef REPAIR_ZERO_FINFO
520 lfs_ientry(&ifp, fs, lfs_dino_getinumber(fs, dip), &ifbp);
521 idaddr = lfs_if_getdaddr(fs, ifp);
522 brelse(ifbp, 0);
523 if (idaddr != daddr)
524 #endif
525 continue;
526
527 /*
528 * A current inode. Add it.
529 */
530 ++*bic;
531 nbip = (BLOCK_INFO *)realloc(bip, *bic *
532 sizeof(*bip));
533 if (nbip)
534 bip = nbip;
535 else {
536 --*bic;
537 *bipp = bip;
538 return 0x0;
539 }
540 bip[*bic - 1].bi_inode = lfs_dino_getinumber(fs, dip);
541 bip[*bic - 1].bi_lbn = LFS_UNUSED_LBN;
542 bip[*bic - 1].bi_daddr = daddr;
543 bip[*bic - 1].bi_segcreate = lfs_ss_getcreate(fs, ssp);
544 bip[*bic - 1].bi_version = lfs_dino_getgen(fs, dip);
545 bip[*bic - 1].bi_bp = dip;
546 bip[*bic - 1].bi_size = DINOSIZE(fs);
547 }
548 inoc += i;
549 daddr += lfs_btofsb(fs, lfs_sb_getibsize(fs));
550 iip = NEXTLOWER_IINFO(fs, iip);
551 continue;
552 }
553
554 /*
555 * Note each file block from the finfo blocks
556 */
557 if (fic >= lfs_ss_getnfinfo(fs, ssp))
558 continue;
559
560 /* Count this finfo, whether or not we use it */
561 ++fic;
562
563 /*
564 * If this finfo has nblocks==0, it was written wrong.
565 * Kernels with this problem always wrote this zero-sized
566 * finfo last, so just ignore it.
567 */
568 if (lfs_fi_getnblocks(fs, fip) == 0) {
569 #ifdef REPAIR_ZERO_FINFO
570 struct ubuf *nbp;
571 SEGSUM *nssp;
572
573 syslog(LOG_WARNING, "fixing short FINFO at %jx (seg %d)",
574 (intmax_t)odaddr, lfs_dtosn(fs, odaddr));
575 bread(fs->clfs_devvp, odaddr, lfs_sb_getfsize(fs),
576 0, &nbp);
577 nssp = (SEGSUM *)nbp->b_data;
578 --nssp->ss_nfinfo;
579 nssp->ss_sumsum = cksum(&nssp->ss_datasum,
580 lfs_sb_getsumsize(fs) - sizeof(nssp->ss_sumsum));
581 bwrite(nbp);
582 #endif
583 syslog(LOG_WARNING, "zero-length FINFO at %jx (seg %d)",
584 (intmax_t)odaddr, lfs_dtosn(fs, odaddr));
585 continue;
586 }
587
588 /*
589 * Check currency before adding blocks
590 */
591 #ifdef REPAIR_ZERO_FINFO
592 vers = -1;
593 #else
594 lfs_ientry(&ifp, fs, lfs_fi_getino(fs, fip), &ifbp);
595 vers = lfs_if_getversion(fs, ifp);
596 brelse(ifbp, 0);
597 #endif
598 if (vers != lfs_fi_getversion(fs, fip)) {
599 size_t size;
600
601 /* Read all the blocks from the data summary */
602 for (i = 0; i < lfs_fi_getnblocks(fs, fip); i++) {
603 size = (i == lfs_fi_getnblocks(fs, fip) - 1) ?
604 lfs_fi_getlastlength(fs, fip) : lfs_sb_getbsize(fs);
605 cp = fd_ptrget(fs->clfs_devvp, daddr);
606 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck);
607 daddr += lfs_btofsb(fs, size);
608 }
609 fip = NEXT_FINFO(fs, fip);
610 continue;
611 }
612
613 /* Add all the blocks from the finfos (current or not) */
614 nbip = (BLOCK_INFO *)realloc(bip, (*bic + lfs_fi_getnblocks(fs, fip)) *
615 sizeof(*bip));
616 if (nbip)
617 bip = nbip;
618 else {
619 *bipp = bip;
620 return 0x0;
621 }
622
623 for (i = 0; i < lfs_fi_getnblocks(fs, fip); i++) {
624 bip[*bic + i].bi_inode = lfs_fi_getino(fs, fip);
625 bip[*bic + i].bi_lbn = lfs_fi_getblock(fs, fip, i);
626 bip[*bic + i].bi_daddr = daddr;
627 bip[*bic + i].bi_segcreate = lfs_ss_getcreate(fs, ssp);
628 bip[*bic + i].bi_version = lfs_fi_getversion(fs, fip);
629 bip[*bic + i].bi_size = (i == lfs_fi_getnblocks(fs, fip) - 1) ?
630 lfs_fi_getlastlength(fs, fip) : lfs_sb_getbsize(fs);
631 cp = fd_ptrget(fs->clfs_devvp, daddr);
632 ck = lfs_cksum_part(cp, sizeof(u_int32_t), ck);
633 bip[*bic + i].bi_bp = cp;
634 daddr += lfs_btofsb(fs, bip[*bic + i].bi_size);
635
636 #ifdef TEST_PATTERN
637 check_test_pattern(bip + *bic + i); /* XXXDEBUG */
638 #endif
639 }
640 *bic += lfs_fi_getnblocks(fs, fip);
641 fip = NEXT_FINFO(fs, fip);
642 }
643
644 #ifndef REPAIR_ZERO_FINFO
645 if (lfs_ss_getdatasum(fs, ssp) != ck) {
646 syslog(LOG_WARNING, "%s: data checksum bad at 0x%jx:"
647 " read 0x%x, computed 0x%x", lfs_sb_getfsmnt(fs),
648 (intmax_t)odaddr,
649 lfs_ss_getdatasum(fs, ssp), ck);
650 *bic = obic;
651 return 0x0;
652 }
653 #endif
654
655 *bipp = bip;
656 return daddr;
657 }
658
659 static void
660 log_segment_read(struct clfs *fs, int sn)
661 {
662 FILE *fp;
663 char *cp;
664
665 /*
666 * Write the segment read, and its contents, into a log file in
667 * the current directory. We don't need to log the location of
668 * the segment, since that can be inferred from the segments up
669 * to this point (ss_nextseg field of the previously written segment).
670 *
671 * We can use this info later to reconstruct the filesystem at any
672 * given point in time for analysis, by replaying the log forward
673 * indexed by the segment serial numbers; but it is not suitable
674 * for everyday use since the copylog will be simply enormous.
675 */
676 cp = fd_ptrget(fs->clfs_devvp, lfs_sntod(fs, sn));
677
678 fp = fopen(copylog_filename, "ab");
679 if (fp != NULL) {
680 if (fwrite(cp, (size_t)lfs_sb_getssize(fs), 1, fp) != 1) {
681 perror("writing segment to copy log");
682 }
683 }
684 fclose(fp);
685 }
686
687 /*
688 * Read a segment to populate the BLOCK_INFO structures.
689 * Return the number of partial segments read and parsed.
690 */
691 int
692 load_segment(struct clfs *fs, int sn, BLOCK_INFO **bipp, int *bic)
693 {
694 daddr_t daddr;
695 int i, npseg;
696
697 daddr = lfs_sntod(fs, sn);
698 if (daddr < lfs_btofsb(fs, LFS_LABELPAD))
699 daddr = lfs_btofsb(fs, LFS_LABELPAD);
700 for (i = 0; i < LFS_MAXNUMSB; i++) {
701 if (lfs_sb_getsboff(fs, i) == daddr) {
702 daddr += lfs_btofsb(fs, LFS_SBPAD);
703 break;
704 }
705 }
706
707 /* Preload the segment buffer */
708 if (fd_preload(fs->clfs_devvp, lfs_sntod(fs, sn)) < 0)
709 return -1;
710
711 if (copylog_filename)
712 log_segment_read(fs, sn);
713
714 /* Note bytes read for stats */
715 cleaner_stats.segs_cleaned++;
716 cleaner_stats.bytes_read += lfs_sb_getssize(fs);
717 ++fs->clfs_nactive;
718
719 npseg = 0;
720 while(lfs_dtosn(fs, daddr) == sn &&
721 lfs_dtosn(fs, daddr + lfs_btofsb(fs, lfs_sb_getbsize(fs))) == sn) {
722 daddr = parse_pseg(fs, daddr, bipp, bic);
723 if (daddr == 0x0) {
724 ++cleaner_stats.segs_error;
725 break;
726 }
727 ++npseg;
728 }
729
730 return npseg;
731 }
732
733 void
734 calc_cb(struct clfs *fs, int sn, struct clfs_seguse *t)
735 {
736 time_t now;
737 int64_t age, benefit, cost;
738
739 time(&now);
740 age = (now < t->lastmod ? 0 : now - t->lastmod);
741
742 /* Under no circumstances clean active or already-clean segments */
743 if ((t->flags & SEGUSE_ACTIVE) || !(t->flags & SEGUSE_DIRTY)) {
744 t->priority = 0;
745 return;
746 }
747
748 /*
749 * If the segment is empty, there is no reason to clean it.
750 * Clear its error condition, if any, since we are never going to
751 * try to parse this one.
752 */
753 if (t->nbytes == 0) {
754 t->flags &= ~SEGUSE_ERROR; /* Strip error once empty */
755 t->priority = 0;
756 return;
757 }
758
759 if (t->flags & SEGUSE_ERROR) { /* No good if not already empty */
760 /* No benefit */
761 t->priority = 0;
762 return;
763 }
764
765 if (t->nbytes > lfs_sb_getssize(fs)) {
766 /* Another type of error */
767 syslog(LOG_WARNING, "segment %d: bad seguse count %d",
768 sn, t->nbytes);
769 t->flags |= SEGUSE_ERROR;
770 t->priority = 0;
771 return;
772 }
773
774 /*
775 * The non-degenerate case. Use Rosenblum's cost-benefit algorithm.
776 * Calculate the benefit from cleaning this segment (one segment,
777 * minus fragmentation, dirty blocks and a segment summary block)
778 * and weigh that against the cost (bytes read plus bytes written).
779 * We count the summary headers as "dirty" to avoid cleaning very
780 * old and very full segments.
781 */
782 benefit = (int64_t)lfs_sb_getssize(fs) - t->nbytes -
783 (t->nsums + 1) * lfs_sb_getfsize(fs);
784 if (lfs_sb_getbsize(fs) > lfs_sb_getfsize(fs)) /* fragmentation */
785 benefit -= (lfs_sb_getbsize(fs) / 2);
786 if (benefit <= 0) {
787 t->priority = 0;
788 return;
789 }
790
791 cost = lfs_sb_getssize(fs) + t->nbytes;
792 t->priority = (256 * benefit * age) / cost;
793
794 return;
795 }
796
797 /*
798 * Comparator for BLOCK_INFO structures. Anything not in one of the segments
799 * we're looking at sorts higher; after that we sort first by inode number
800 * and then by block number (unsigned, i.e., negative sorts higher) *but*
801 * sort inodes before data blocks.
802 */
803 static int
804 bi_comparator(const void *va, const void *vb)
805 {
806 const BLOCK_INFO *a, *b;
807
808 a = (const BLOCK_INFO *)va;
809 b = (const BLOCK_INFO *)vb;
810
811 /* Check for out-of-place block */
812 if (a->bi_segcreate == a->bi_daddr &&
813 b->bi_segcreate != b->bi_daddr)
814 return -1;
815 if (a->bi_segcreate != a->bi_daddr &&
816 b->bi_segcreate == b->bi_daddr)
817 return 1;
818 if (a->bi_size <= 0 && b->bi_size > 0)
819 return 1;
820 if (b->bi_size <= 0 && a->bi_size > 0)
821 return -1;
822
823 /* Check inode number */
824 if (a->bi_inode != b->bi_inode)
825 return a->bi_inode - b->bi_inode;
826
827 /* Check lbn */
828 if (a->bi_lbn == LFS_UNUSED_LBN) /* Inodes sort lower than blocks */
829 return -1;
830 if (b->bi_lbn == LFS_UNUSED_LBN)
831 return 1;
832 if ((u_int64_t)a->bi_lbn > (u_int64_t)b->bi_lbn)
833 return 1;
834 else
835 return -1;
836
837 return 0;
838 }
839
840 /*
841 * Comparator for sort_segments: cost-benefit equation.
842 */
843 static int
844 cb_comparator(const void *va, const void *vb)
845 {
846 const struct clfs_seguse *a, *b;
847
848 a = *(const struct clfs_seguse * const *)va;
849 b = *(const struct clfs_seguse * const *)vb;
850 return a->priority > b->priority ? -1 : 1;
851 }
852
853 void
854 toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, blkcnt_t *bic, int *sizep)
855 {
856 blkcnt_t i;
857 int r;
858 BLOCK_INFO *bip = *bipp;
859 struct lfs_fcntl_markv /* {
860 BLOCK_INFO *blkiov;
861 int blkcnt;
862 } */ lim;
863
864 if (bic == 0 || bip == NULL)
865 return;
866
867 /*
868 * Kludge: Store the disk address in segcreate so we know which
869 * ones to toss.
870 */
871 for (i = 0; i < *bic; i++)
872 bip[i].bi_segcreate = bip[i].bi_daddr;
873
874 /*
875 * XXX: blkcnt_t is 64 bits, so *bic might overflow size_t
876 * (the argument type of heapsort's number argument) on a
877 * 32-bit platform. However, if so we won't have got this far
878 * because we'll have failed trying to allocate the array. So
879 * while *bic here might cause a 64->32 truncation, it's safe.
880 */
881 /* Sort the blocks */
882 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator);
883
884 /* Use bmapv to locate the blocks */
885 lim.blkiov = bip;
886 lim.blkcnt = *bic;
887 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim)) < 0) {
888 syslog(LOG_WARNING, "%s: bmapv returned %d (%m)",
889 lfs_sb_getfsmnt(fs), r);
890 return;
891 }
892
893 /* Toss blocks not in this segment */
894 heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator);
895
896 /* Get rid of stale blocks */
897 if (sizep)
898 *sizep = 0;
899 for (i = 0; i < *bic; i++) {
900 if (bip[i].bi_segcreate != bip[i].bi_daddr)
901 break;
902 if (sizep)
903 *sizep += bip[i].bi_size;
904 }
905 *bic = i; /* XXX should we shrink bip? */
906 *bipp = bip;
907
908 return;
909 }
910
911 /*
912 * Clean a segment and mark it invalid.
913 */
914 int
915 invalidate_segment(struct clfs *fs, int sn)
916 {
917 BLOCK_INFO *bip;
918 int i, r, bic;
919 blkcnt_t widebic;
920 off_t nb;
921 double util;
922 struct lfs_fcntl_markv /* {
923 BLOCK_INFO *blkiov;
924 int blkcnt;
925 } */ lim;
926
927 dlog("%s: inval seg %d", lfs_sb_getfsmnt(fs), sn);
928
929 bip = NULL;
930 bic = 0;
931 fs->clfs_nactive = 0;
932 if (load_segment(fs, sn, &bip, &bic) <= 0)
933 return -1;
934 widebic = bic;
935 toss_old_blocks(fs, &bip, &widebic, NULL);
936 bic = widebic;
937
938 /* Record statistics */
939 for (i = nb = 0; i < bic; i++)
940 nb += bip[i].bi_size;
941 util = ((double)nb) / (fs->clfs_nactive * lfs_sb_getssize(fs));
942 cleaner_stats.util_tot += util;
943 cleaner_stats.util_sos += util * util;
944 cleaner_stats.bytes_written += nb;
945
946 /*
947 * Use markv to move the blocks.
948 */
949 lim.blkiov = bip;
950 lim.blkcnt = bic;
951 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim)) < 0) {
952 syslog(LOG_WARNING, "%s: markv returned %d (%m) "
953 "for seg %d", lfs_sb_getfsmnt(fs), r, sn);
954 return r;
955 }
956
957 /*
958 * Finally call invalidate to invalidate the segment.
959 */
960 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNINVAL, &sn)) < 0) {
961 syslog(LOG_WARNING, "%s: inval returned %d (%m) "
962 "for seg %d", lfs_sb_getfsmnt(fs), r, sn);
963 return r;
964 }
965
966 return 0;
967 }
968
969 /*
970 * Check to see if the given ino/lbn pair is represented in the BLOCK_INFO
971 * array we are sending to the kernel, or if the kernel will have to add it.
972 * The kernel will only add each such pair once, though, so keep track of
973 * previous requests in a separate "extra" BLOCK_INFO array. Returns 1
974 * if the block needs to be added, 0 if it is already represented.
975 */
976 static int
977 check_or_add(ino_t ino, daddr_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp)
978 {
979 BLOCK_INFO *t, *ebip = *ebipp;
980 int ebic = *ebicp;
981 int k;
982
983 for (k = 0; k < bic; k++) {
984 if (bip[k].bi_inode != ino)
985 break;
986 if (bip[k].bi_lbn == lbn) {
987 return 0;
988 }
989 }
990
991 /* Look on the list of extra blocks, too */
992 for (k = 0; k < ebic; k++) {
993 if (ebip[k].bi_inode == ino && ebip[k].bi_lbn == lbn) {
994 return 0;
995 }
996 }
997
998 ++ebic;
999 t = realloc(ebip, ebic * sizeof(BLOCK_INFO));
1000 if (t == NULL)
1001 return 1; /* Note *ebicp is unchanged */
1002
1003 ebip = t;
1004 ebip[ebic - 1].bi_inode = ino;
1005 ebip[ebic - 1].bi_lbn = lbn;
1006
1007 *ebipp = ebip;
1008 *ebicp = ebic;
1009 return 1;
1010 }
1011
1012 /*
1013 * Look for indirect blocks we will have to write which are not
1014 * contained in this collection of blocks. This constitutes
1015 * a hidden cleaning cost, since we are unaware of it until we
1016 * have already read the segments. Return the total cost, and fill
1017 * in *ifc with the part of that cost due to rewriting the Ifile.
1018 */
1019 static off_t
1020 check_hidden_cost(struct clfs *fs, BLOCK_INFO *bip, int bic, off_t *ifc)
1021 {
1022 int start;
1023 struct indir in[ULFS_NIADDR + 1];
1024 int num;
1025 int i, j, ebic;
1026 BLOCK_INFO *ebip;
1027 daddr_t lbn;
1028
1029 start = 0;
1030 ebip = NULL;
1031 ebic = 0;
1032 for (i = 0; i < bic; i++) {
1033 if (i == 0 || bip[i].bi_inode != bip[start].bi_inode) {
1034 start = i;
1035 /*
1036 * Look for IFILE blocks, unless this is the Ifile.
1037 */
1038 if (bip[i].bi_inode != LFS_IFILE_INUM) {
1039 lbn = lfs_sb_getcleansz(fs) + bip[i].bi_inode /
1040 lfs_sb_getifpb(fs);
1041 *ifc += check_or_add(LFS_IFILE_INUM, lbn,
1042 bip, bic, &ebip, &ebic);
1043 }
1044 }
1045 if (bip[i].bi_lbn == LFS_UNUSED_LBN)
1046 continue;
1047 if (bip[i].bi_lbn < ULFS_NDADDR)
1048 continue;
1049
1050 /* XXX the struct lfs cast is completely wrong/unsafe */
1051 ulfs_getlbns((struct lfs *)fs, NULL, (daddr_t)bip[i].bi_lbn, in, &num);
1052 for (j = 0; j < num; j++) {
1053 check_or_add(bip[i].bi_inode, in[j].in_lbn,
1054 bip + start, bic - start, &ebip, &ebic);
1055 }
1056 }
1057 return ebic;
1058 }
1059
1060 /*
1061 * Select segments to clean, add blocks from these segments to a cleaning
1062 * list, and send this list through lfs_markv() to move them to new
1063 * locations on disk.
1064 */
1065 static int
1066 clean_fs(struct clfs *fs, const CLEANERINFO64 *cip)
1067 {
1068 int i, j, ngood, sn, bic, r, npos;
1069 blkcnt_t widebic;
1070 int bytes, totbytes;
1071 struct ubuf *bp;
1072 SEGUSE *sup;
1073 static BLOCK_INFO *bip;
1074 struct lfs_fcntl_markv /* {
1075 BLOCK_INFO *blkiov;
1076 int blkcnt;
1077 } */ lim;
1078 int mc;
1079 BLOCK_INFO *mbip;
1080 int inc;
1081 off_t nb;
1082 off_t goal;
1083 off_t extra, if_extra;
1084 double util;
1085
1086 /* Read the segment table into our private structure */
1087 npos = 0;
1088 for (i = 0; i < lfs_sb_getnseg(fs); i+= lfs_sb_getsepb(fs)) {
1089 bread(fs->lfs_ivnode,
1090 lfs_sb_getcleansz(fs) + i / lfs_sb_getsepb(fs),
1091 lfs_sb_getbsize(fs), 0, &bp);
1092 for (j = 0; j < lfs_sb_getsepb(fs) && i + j < lfs_sb_getnseg(fs); j++) {
1093 sup = ((SEGUSE *)bp->b_data) + j;
1094 fs->clfs_segtab[i + j].nbytes = sup->su_nbytes;
1095 fs->clfs_segtab[i + j].nsums = sup->su_nsums;
1096 fs->clfs_segtab[i + j].lastmod = sup->su_lastmod;
1097 /* Keep error status but renew other flags */
1098 fs->clfs_segtab[i + j].flags &= SEGUSE_ERROR;
1099 fs->clfs_segtab[i + j].flags |= sup->su_flags;
1100
1101 /* Compute cost-benefit coefficient */
1102 calc_cb(fs, i + j, fs->clfs_segtab + i + j);
1103 if (fs->clfs_segtab[i + j].priority > 0)
1104 ++npos;
1105 }
1106 brelse(bp, 0);
1107 }
1108
1109 /* Sort segments based on cleanliness, fulness, and condition */
1110 heapsort(fs->clfs_segtabp, lfs_sb_getnseg(fs), sizeof(struct clfs_seguse *),
1111 cb_comparator);
1112
1113 /* If no segment is cleanable, just return */
1114 if (fs->clfs_segtabp[0]->priority == 0) {
1115 dlog("%s: no segment cleanable", lfs_sb_getfsmnt(fs));
1116 return 0;
1117 }
1118
1119 /* Load some segments' blocks into bip */
1120 bic = 0;
1121 fs->clfs_nactive = 0;
1122 ngood = 0;
1123 if (use_bytes) {
1124 /* Set attainable goal */
1125 goal = lfs_sb_getssize(fs) * atatime;
1126 if (goal > (cip->clean - 1) * lfs_sb_getssize(fs) / 2)
1127 goal = MAX((cip->clean - 1) * lfs_sb_getssize(fs),
1128 lfs_sb_getssize(fs)) / 2;
1129
1130 dlog("%s: cleaning with goal %" PRId64
1131 " bytes (%d segs clean, %d cleanable)",
1132 lfs_sb_getfsmnt(fs), goal, cip->clean, npos);
1133 syslog(LOG_INFO, "%s: cleaning with goal %" PRId64
1134 " bytes (%d segs clean, %d cleanable)",
1135 lfs_sb_getfsmnt(fs), goal, cip->clean, npos);
1136 totbytes = 0;
1137 for (i = 0; i < lfs_sb_getnseg(fs) && totbytes < goal; i++) {
1138 if (fs->clfs_segtabp[i]->priority == 0)
1139 break;
1140 /* Upper bound on number of segments at once */
1141 if (ngood * lfs_sb_getssize(fs) > 4 * goal)
1142 break;
1143 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab);
1144 dlog("%s: add seg %d prio %" PRIu64
1145 " containing %ld bytes",
1146 lfs_sb_getfsmnt(fs), sn, fs->clfs_segtabp[i]->priority,
1147 fs->clfs_segtabp[i]->nbytes);
1148 if ((r = load_segment(fs, sn, &bip, &bic)) > 0) {
1149 ++ngood;
1150 widebic = bic;
1151 toss_old_blocks(fs, &bip, &widebic, &bytes);
1152 bic = widebic;
1153 totbytes += bytes;
1154 } else if (r == 0)
1155 fd_release(fs->clfs_devvp);
1156 else
1157 break;
1158 }
1159 } else {
1160 /* Set attainable goal */
1161 goal = atatime;
1162 if (goal > cip->clean - 1)
1163 goal = MAX(cip->clean - 1, 1);
1164
1165 dlog("%s: cleaning with goal %d segments (%d clean, %d cleanable)",
1166 lfs_sb_getfsmnt(fs), (int)goal, cip->clean, npos);
1167 for (i = 0; i < lfs_sb_getnseg(fs) && ngood < goal; i++) {
1168 if (fs->clfs_segtabp[i]->priority == 0)
1169 break;
1170 sn = (fs->clfs_segtabp[i] - fs->clfs_segtab);
1171 dlog("%s: add seg %d prio %" PRIu64,
1172 lfs_sb_getfsmnt(fs), sn, fs->clfs_segtabp[i]->priority);
1173 if ((r = load_segment(fs, sn, &bip, &bic)) > 0)
1174 ++ngood;
1175 else if (r == 0)
1176 fd_release(fs->clfs_devvp);
1177 else
1178 break;
1179 }
1180 widebic = bic;
1181 toss_old_blocks(fs, &bip, &widebic, NULL);
1182 bic = widebic;
1183 }
1184
1185 /* If there is nothing to do, try again later. */
1186 if (bic == 0) {
1187 dlog("%s: no blocks to clean in %d cleanable segments",
1188 lfs_sb_getfsmnt(fs), (int)ngood);
1189 fd_release_all(fs->clfs_devvp);
1190 return 0;
1191 }
1192
1193 /* Record statistics */
1194 for (i = nb = 0; i < bic; i++)
1195 nb += bip[i].bi_size;
1196 util = ((double)nb) / (fs->clfs_nactive * lfs_sb_getssize(fs));
1197 cleaner_stats.util_tot += util;
1198 cleaner_stats.util_sos += util * util;
1199 cleaner_stats.bytes_written += nb;
1200
1201 /*
1202 * Check out our blocks to see if there are hidden cleaning costs.
1203 * If there are, we might be cleaning ourselves deeper into a hole
1204 * rather than doing anything useful.
1205 * XXX do something about this.
1206 */
1207 if_extra = 0;
1208 extra = lfs_sb_getbsize(fs) * (off_t)check_hidden_cost(fs, bip, bic, &if_extra);
1209 if_extra *= lfs_sb_getbsize(fs);
1210
1211 /*
1212 * Use markv to move the blocks.
1213 */
1214 if (do_small)
1215 inc = MAXPHYS / lfs_sb_getbsize(fs) - 1;
1216 else
1217 inc = LFS_MARKV_MAXBLKCNT / 2;
1218 for (mc = 0, mbip = bip; mc < bic; mc += inc, mbip += inc) {
1219 lim.blkiov = mbip;
1220 lim.blkcnt = (bic - mc > inc ? inc : bic - mc);
1221 #ifdef TEST_PATTERN
1222 dlog("checking blocks %d-%d", mc, mc + lim.blkcnt - 1);
1223 for (i = 0; i < lim.blkcnt; i++) {
1224 check_test_pattern(mbip + i);
1225 }
1226 #endif /* TEST_PATTERN */
1227 dlog("sending blocks %d-%d", mc, mc + lim.blkcnt - 1);
1228 if ((r = kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim))<0) {
1229 int oerrno = errno;
1230 syslog(LOG_WARNING, "%s: markv returned %d (errno %d, %m)",
1231 lfs_sb_getfsmnt(fs), r, errno);
1232 if (oerrno != EAGAIN && oerrno != ESHUTDOWN) {
1233 syslog(LOG_DEBUG, "%s: errno %d, returning",
1234 lfs_sb_getfsmnt(fs), oerrno);
1235 fd_release_all(fs->clfs_devvp);
1236 return r;
1237 }
1238 if (oerrno == ESHUTDOWN) {
1239 syslog(LOG_NOTICE, "%s: filesystem unmounted",
1240 lfs_sb_getfsmnt(fs));
1241 fd_release_all(fs->clfs_devvp);
1242 return r;
1243 }
1244 }
1245 }
1246
1247 /*
1248 * Report progress (or lack thereof)
1249 */
1250 syslog(LOG_INFO, "%s: wrote %" PRId64 " dirty + %"
1251 PRId64 " supporting indirect + %"
1252 PRId64 " supporting Ifile = %"
1253 PRId64 " bytes to clean %d segs (%" PRId64 "%% recovery)",
1254 lfs_sb_getfsmnt(fs), (int64_t)nb, (int64_t)(extra - if_extra),
1255 (int64_t)if_extra, (int64_t)(nb + extra), ngood,
1256 (ngood ? (int64_t)(100 - (100 * (nb + extra)) /
1257 (ngood * lfs_sb_getssize(fs))) :
1258 (int64_t)0));
1259 if (nb + extra >= ngood * lfs_sb_getssize(fs))
1260 syslog(LOG_WARNING, "%s: cleaner not making forward progress",
1261 lfs_sb_getfsmnt(fs));
1262
1263 /*
1264 * Finally call reclaim to prompt cleaning of the segments.
1265 */
1266 kops.ko_fcntl(fs->clfs_ifilefd, LFCNRECLAIM, NULL);
1267
1268 fd_release_all(fs->clfs_devvp);
1269 return 0;
1270 }
1271
1272 /*
1273 * Read the cleanerinfo block and apply cleaning policy to determine whether
1274 * the given filesystem needs to be cleaned. Returns 1 if it does, 0 if it
1275 * does not, or -1 on error.
1276 */
1277 static int
1278 needs_cleaning(struct clfs *fs, CLEANERINFO64 *cip)
1279 {
1280 CLEANERINFO *cipu;
1281 struct ubuf *bp;
1282 struct stat st;
1283 daddr_t fsb_per_seg, max_free_segs;
1284 time_t now;
1285 double loadavg;
1286
1287 /* If this fs is "on hold", don't clean it. */
1288 if (fs->clfs_onhold) {
1289 #if defined(__GNUC__) && \
1290 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \
1291 defined(__OPTIMIZE_SIZE__)
1292 /*
1293 * XXX: Work around apparent bug with GCC >= 4.8 and -Os: it
1294 * claims that ci.clean is uninitialized in clean_fs (at one
1295 * of the several uses of it, which is neither the first nor
1296 * last use) -- this doesn't happen with plain -O2.
1297 *
1298 * Hopefully in the future further rearrangements will allow
1299 * removing this hack.
1300 */
1301 cip->clean = 0;
1302 #endif
1303 return 0;
1304 }
1305
1306 /*
1307 * Read the cleanerinfo block from the Ifile. We don't want
1308 * the cached information, so invalidate the buffer before
1309 * handing it back.
1310 */
1311 if (bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp)) {
1312 syslog(LOG_ERR, "%s: can't read inode", lfs_sb_getfsmnt(fs));
1313 return -1;
1314 }
1315 cipu = (CLEANERINFO *)bp->b_data;
1316 if (fs->lfs_is64) {
1317 /* Structure copy */
1318 *cip = cipu->u_64;
1319 } else {
1320 /* Copy the fields and promote to 64 bit */
1321 cip->clean = cipu->u_32.clean;
1322 cip->dirty = cipu->u_32.dirty;
1323 cip->bfree = cipu->u_32.bfree;
1324 cip->avail = cipu->u_32.avail;
1325 cip->free_head = cipu->u_32.free_head;
1326 cip->free_tail = cipu->u_32.free_tail;
1327 cip->flags = cipu->u_32.flags;
1328 }
1329 brelse(bp, B_INVAL);
1330 cleaner_stats.bytes_read += lfs_sb_getbsize(fs);
1331
1332 /*
1333 * If the number of segments changed under us, reinit.
1334 * We don't have to start over from scratch, however,
1335 * since we don't hold any buffers.
1336 */
1337 if (lfs_sb_getnseg(fs) != cip->clean + cip->dirty) {
1338 if (reinit_fs(fs) < 0) {
1339 /* The normal case for unmount */
1340 syslog(LOG_NOTICE, "%s: filesystem unmounted", lfs_sb_getfsmnt(fs));
1341 return -1;
1342 }
1343 syslog(LOG_NOTICE, "%s: nsegs changed", lfs_sb_getfsmnt(fs));
1344 }
1345
1346 /* Compute theoretical "free segments" maximum based on usage */
1347 fsb_per_seg = lfs_segtod(fs, 1);
1348 max_free_segs = MAX(cip->bfree, 0) / fsb_per_seg + lfs_sb_getminfreeseg(fs);
1349
1350 dlog("%s: bfree = %d, avail = %d, clean = %d/%d",
1351 lfs_sb_getfsmnt(fs), cip->bfree, cip->avail, cip->clean,
1352 lfs_sb_getnseg(fs));
1353
1354 /* If the writer is waiting on us, clean it */
1355 if (cip->clean <= lfs_sb_getminfreeseg(fs) ||
1356 (cip->flags & LFS_CLEANER_MUST_CLEAN))
1357 return 1;
1358
1359 /* If there are enough segments, don't clean it */
1360 if (cip->bfree - cip->avail <= fsb_per_seg &&
1361 cip->avail > fsb_per_seg)
1362 return 0;
1363
1364 /* If we are in dire straits, clean it */
1365 if (cip->bfree - cip->avail > fsb_per_seg &&
1366 cip->avail <= fsb_per_seg)
1367 return 1;
1368
1369 /* If under busy threshold, clean regardless of load */
1370 if (cip->clean < max_free_segs * BUSY_LIM)
1371 return 1;
1372
1373 /* Check busy status; clean if idle and under idle limit */
1374 if (use_fs_idle) {
1375 /* Filesystem idle */
1376 time(&now);
1377 if (fstat(fs->clfs_ifilefd, &st) < 0) {
1378 syslog(LOG_ERR, "%s: failed to stat ifile",
1379 lfs_sb_getfsmnt(fs));
1380 return -1;
1381 }
1382 if (now - st.st_mtime > segwait_timeout &&
1383 cip->clean < max_free_segs * IDLE_LIM)
1384 return 1;
1385 } else {
1386 /* CPU idle - use one-minute load avg */
1387 if (getloadavg(&loadavg, 1) == -1) {
1388 syslog(LOG_ERR, "%s: failed to get load avg",
1389 lfs_sb_getfsmnt(fs));
1390 return -1;
1391 }
1392 if (loadavg < load_threshold &&
1393 cip->clean < max_free_segs * IDLE_LIM)
1394 return 1;
1395 }
1396
1397 return 0;
1398 }
1399
1400 /*
1401 * Report statistics. If the signal was SIGUSR2, clear the statistics too.
1402 * If the signal was SIGINT, exit.
1403 */
1404 static void
1405 sig_report(int sig)
1406 {
1407 double avg = 0.0, stddev;
1408
1409 avg = cleaner_stats.util_tot / MAX(cleaner_stats.segs_cleaned, 1.0);
1410 stddev = cleaner_stats.util_sos / MAX(cleaner_stats.segs_cleaned -
1411 avg * avg, 1.0);
1412 syslog(LOG_INFO, "bytes read: %" PRId64, cleaner_stats.bytes_read);
1413 syslog(LOG_INFO, "bytes written: %" PRId64, cleaner_stats.bytes_written);
1414 syslog(LOG_INFO, "segments cleaned: %" PRId64, cleaner_stats.segs_cleaned);
1415 #if 0
1416 /* "Empty segments" is meaningless, since the kernel handles those */
1417 syslog(LOG_INFO, "empty segments: %" PRId64, cleaner_stats.segs_empty);
1418 #endif
1419 syslog(LOG_INFO, "error segments: %" PRId64, cleaner_stats.segs_error);
1420 syslog(LOG_INFO, "utilization total: %g", cleaner_stats.util_tot);
1421 syslog(LOG_INFO, "utilization sos: %g", cleaner_stats.util_sos);
1422 syslog(LOG_INFO, "utilization avg: %4.2f", avg);
1423 syslog(LOG_INFO, "utilization sdev: %9.6f", stddev);
1424
1425 if (debug)
1426 bufstats();
1427
1428 if (sig == SIGUSR2)
1429 memset(&cleaner_stats, 0, sizeof(cleaner_stats));
1430 if (sig == SIGINT)
1431 exit(0);
1432 }
1433
/*
 * Signal handler that simply terminates the process with exit status 0.
 * The signal number is not examined.
 */
static void
sig_exit(int sig)
{
	(void)sig;	/* unused */

	exit(0);
}
1439
/*
 * Print the command-line synopsis via errx() and exit with status 1.
 * Does not return.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: lfs_cleanerd [-bcdfmqs] [-i segnum] [-l load] "
	    "[-n nsegs] [-r report_freq] [-t timeout] fs_name ...";

	errx(1, "%s", synopsis);
}
1446
#ifndef LFS_CLEANER_AS_LIB
/*
 * Program entry point for the stand-alone build (compiled out when the
 * cleaner is built as a library): hand the command line straight to
 * lfs_cleaner_main() and use its result as the exit status.
 */
int
main(int argc, char **argv)
{
	int status;

	status = lfs_cleaner_main(argc, argv);
	return status;
}
#endif
1458
1459 int
1460 lfs_cleaner_main(int argc, char **argv)
1461 {
1462 int i, opt, error, r, loopcount, nodetach;
1463 struct timeval tv;
1464 #ifdef LFS_CLEANER_AS_LIB
1465 sem_t *semaddr = NULL;
1466 #endif
1467 CLEANERINFO64 ci;
1468 #ifndef USE_CLIENT_SERVER
1469 char *cp, *pidname;
1470 #endif
1471
1472 /*
1473 * Set up defaults
1474 */
1475 atatime = 1;
1476 segwait_timeout = 300; /* Five minutes */
1477 load_threshold = 0.2;
1478 stat_report = 0;
1479 inval_segment = -1;
1480 copylog_filename = NULL;
1481 nodetach = 0;
1482
1483 /*
1484 * Parse command-line arguments
1485 */
1486 while ((opt = getopt(argc, argv, "bC:cdDfi:l:mn:qr:sS:t:")) != -1) {
1487 switch (opt) {
1488 case 'b': /* Use bytes written, not segments read */
1489 use_bytes = 1;
1490 break;
1491 case 'C': /* copy log */
1492 copylog_filename = optarg;
1493 break;
1494 case 'c': /* Coalesce files */
1495 do_coalesce++;
1496 break;
1497 case 'd': /* Debug mode. */
1498 nodetach++;
1499 debug++;
1500 break;
1501 case 'D': /* stay-on-foreground */
1502 nodetach++;
1503 break;
1504 case 'f': /* Use fs idle time rather than cpu idle */
1505 use_fs_idle = 1;
1506 break;
1507 case 'i': /* Invalidate this segment */
1508 inval_segment = atoi(optarg);
1509 break;
1510 case 'l': /* Load below which to clean */
1511 load_threshold = atof(optarg);
1512 break;
1513 case 'm': /* [compat only] */
1514 break;
1515 case 'n': /* How many segs to clean at once */
1516 atatime = atoi(optarg);
1517 break;
1518 case 'q': /* Quit after one run */
1519 do_quit = 1;
1520 break;
1521 case 'r': /* Report every stat_report segments */
1522 stat_report = atoi(optarg);
1523 break;
1524 case 's': /* Small writes */
1525 do_small = 1;
1526 break;
1527 #ifdef LFS_CLEANER_AS_LIB
1528 case 'S': /* semaphore */
1529 semaddr = (void*)(uintptr_t)strtoull(optarg,NULL,0);
1530 break;
1531 #endif
1532 case 't': /* timeout */
1533 segwait_timeout = atoi(optarg);
1534 break;
1535 default:
1536 usage();
1537 /* NOTREACHED */
1538 }
1539 }
1540 argc -= optind;
1541 argv += optind;
1542
1543 if (argc < 1)
1544 usage();
1545 if (inval_segment >= 0 && argc != 1) {
1546 errx(1, "lfs_cleanerd: may only specify one filesystem when "
1547 "using -i flag");
1548 }
1549
1550 if (do_coalesce) {
1551 errx(1, "lfs_cleanerd: -c disabled due to reports of file "
1552 "corruption; you may re-enable it by rebuilding the "
1553 "cleaner");
1554 }
1555
1556 /*
1557 * Set up daemon mode or foreground mode
1558 */
1559 if (nodetach) {
1560 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID | LOG_PERROR,
1561 LOG_DAEMON);
1562 signal(SIGINT, sig_report);
1563 } else {
1564 if (daemon(0, 0) == -1)
1565 err(1, "lfs_cleanerd: couldn't become a daemon!");
1566 openlog("lfs_cleanerd", LOG_NDELAY | LOG_PID, LOG_DAEMON);
1567 signal(SIGINT, sig_exit);
1568 }
1569
1570 /*
1571 * Look for an already-running master daemon. If there is one,
1572 * send it our filesystems to add to its list and exit.
1573 * If there is none, become the master.
1574 */
1575 #ifdef USE_CLIENT_SERVER
1576 try_to_become_master(argc, argv);
1577 #else
1578 /* XXX think about this */
1579 asprintf(&pidname, "lfs_cleanerd:m:%s", argv[0]);
1580 if (pidname == NULL) {
1581 syslog(LOG_ERR, "malloc failed: %m");
1582 exit(1);
1583 }
1584 for (cp = pidname; cp != NULL; cp = strchr(cp, '/'))
1585 *cp = '|';
1586 pidfile(pidname);
1587 #endif
1588
1589 /*
1590 * Signals mean daemon should report its statistics
1591 */
1592 memset(&cleaner_stats, 0, sizeof(cleaner_stats));
1593 signal(SIGUSR1, sig_report);
1594 signal(SIGUSR2, sig_report);
1595
1596 /*
1597 * Start up buffer cache. We only use this for the Ifile,
1598 * and we will resize it if necessary, so it can start small.
1599 */
1600 bufinit(4);
1601
1602 #ifdef REPAIR_ZERO_FINFO
1603 {
1604 BLOCK_INFO *bip = NULL;
1605 int bic = 0;
1606
1607 nfss = 1;
1608 fsp = (struct clfs **)malloc(sizeof(*fsp));
1609 fsp[0] = (struct clfs *)calloc(1, sizeof(**fsp));
1610
1611 if (init_unmounted_fs(fsp[0], argv[0]) < 0) {
1612 err(1, "init_unmounted_fs");
1613 }
1614 dlog("Filesystem has %d segments", fsp[0]->lfs_nseg);
1615 for (i = 0; i < fsp[0]->lfs_nseg; i++) {
1616 load_segment(fsp[0], i, &bip, &bic);
1617 bic = 0;
1618 }
1619 exit(0);
1620 }
1621 #endif
1622
1623 /*
1624 * Initialize cleaning structures, open devices, etc.
1625 */
1626 nfss = argc;
1627 fsp = (struct clfs **)malloc(nfss * sizeof(*fsp));
1628 if (fsp == NULL) {
1629 syslog(LOG_ERR, "couldn't allocate fs table: %m");
1630 exit(1);
1631 }
1632 for (i = 0; i < nfss; i++) {
1633 fsp[i] = (struct clfs *)calloc(1, sizeof(**fsp));
1634 if ((r = init_fs(fsp[i], argv[i])) < 0) {
1635 syslog(LOG_ERR, "%s: couldn't init: error code %d",
1636 argv[i], r);
1637 handle_error(fsp, i);
1638 --i; /* Do the new #i over again */
1639 }
1640 }
1641
1642 /*
1643 * If asked to coalesce, do so and exit.
1644 */
1645 if (do_coalesce) {
1646 for (i = 0; i < nfss; i++)
1647 clean_all_inodes(fsp[i]);
1648 exit(0);
1649 }
1650
1651 /*
1652 * If asked to invalidate a segment, do that and exit.
1653 */
1654 if (inval_segment >= 0) {
1655 invalidate_segment(fsp[0], inval_segment);
1656 exit(0);
1657 }
1658
1659 /*
1660 * Main cleaning loop.
1661 */
1662 loopcount = 0;
1663 #ifdef LFS_CLEANER_AS_LIB
1664 if (semaddr)
1665 sem_post(semaddr);
1666 #endif
1667 error = 0;
1668 while (nfss > 0) {
1669 int cleaned_one;
1670 do {
1671 #ifdef USE_CLIENT_SERVER
1672 check_control_socket();
1673 #endif
1674 cleaned_one = 0;
1675 for (i = 0; i < nfss; i++) {
1676 if ((error = needs_cleaning(fsp[i], &ci)) < 0) {
1677 syslog(LOG_DEBUG, "%s: needs_cleaning returned %d",
1678 getprogname(), error);
1679 handle_error(fsp, i);
1680 continue;
1681 }
1682 if (error == 0) /* No need to clean */
1683 continue;
1684
1685 reload_ifile(fsp[i]);
1686 if ((error = clean_fs(fsp[i], &ci)) < 0) {
1687 syslog(LOG_DEBUG, "%s: clean_fs returned %d",
1688 getprogname(), error);
1689 handle_error(fsp, i);
1690 continue;
1691 }
1692 ++cleaned_one;
1693 }
1694 ++loopcount;
1695 if (stat_report && loopcount % stat_report == 0)
1696 sig_report(0);
1697 if (do_quit)
1698 exit(0);
1699 } while(cleaned_one);
1700 tv.tv_sec = segwait_timeout;
1701 tv.tv_usec = 0;
1702 /* XXX: why couldn't others work if fsp socket is shutdown? */
1703 error = kops.ko_fcntl(fsp[0]->clfs_ifilefd,LFCNSEGWAITALL,&tv);
1704 if (error) {
1705 if (errno == ESHUTDOWN) {
1706 for (i = 0; i < nfss; i++) {
1707 syslog(LOG_INFO, "%s: shutdown",
1708 getprogname());
1709 handle_error(fsp, i);
1710 assert(nfss == 0);
1711 }
1712 } else {
1713 #ifdef LFS_CLEANER_AS_LIB
1714 error = ESHUTDOWN;
1715 break;
1716 #else
1717 err(1, "LFCNSEGWAITALL");
1718 #endif
1719 }
1720 }
1721 }
1722
1723 /* NOTREACHED */
1724 return error;
1725 }
1726