coalesce.c revision 1.29 1 /* $NetBSD: coalesce.c,v 1.29 2015/08/12 18:23:16 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39
40 #include <ufs/lfs/lfs.h>
41
42 #include <fcntl.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <util.h>
50 #include <errno.h>
51 #include <err.h>
52 #include <assert.h>
53
54 #include <syslog.h>
55
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 #include "kernelops.h"
60
61 extern int debug, do_mmap;
62
63 /*
64 * XXX return the arg to just int when/if we don't need it for
65 * potentially huge block counts any more.
66 */
67 static int
68 log2int(intmax_t n)
69 {
70 int log;
71
72 log = 0;
73 while (n > 0) {
74 ++log;
75 n >>= 1;
76 }
77 return log - 1;
78 }
79
80 enum coalesce_returncodes {
81 COALESCE_OK = 0,
82 COALESCE_NOINODE,
83 COALESCE_TOOSMALL,
84 COALESCE_BADSIZE,
85 COALESCE_BADBLOCKSIZE,
86 COALESCE_NOMEM,
87 COALESCE_BADBMAPV,
88 COALESCE_BADMARKV,
89 COALESCE_NOTWORTHIT,
90 COALESCE_NOTHINGLEFT,
91 COALESCE_EIO,
92
93 COALESCE_MAXERROR
94 };
95
96 const char *coalesce_return[] = {
97 "Successfully coalesced",
98 "File not in use or inode not found",
99 "Not large enough to coalesce",
100 "Negative size",
101 "Not enough blocks to account for size",
102 "Malloc failed",
103 "LFCNBMAPV failed",
104 "Not broken enough to fix",
105 "Too many blocks not found",
106 "Too many blocks found in active segments",
107 "I/O error",
108
109 "No such error"
110 };
111
112 static struct ulfs1_dinode *
113 get_dinode(struct clfs *fs, ino_t ino)
114 {
115 IFILE *ifp;
116 daddr_t daddr;
117 struct ubuf *bp;
118 struct ulfs1_dinode *dip, *r;
119
120 lfs_ientry(&ifp, fs, ino, &bp);
121 daddr = ifp->if_daddr;
122 brelse(bp, 0);
123
124 if (daddr == 0x0)
125 return NULL;
126
127 bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
128 for (dip = (struct ulfs1_dinode *)bp->b_data;
129 dip < (struct ulfs1_dinode *)(bp->b_data + lfs_sb_getibsize(fs)); dip++)
130 if (dip->di_inumber == ino) {
131 r = (struct ulfs1_dinode *)malloc(sizeof(*r));
132 if (r == NULL)
133 break;
134 memcpy(r, dip, sizeof(*r));
135 brelse(bp, 0);
136 return r;
137 }
138 brelse(bp, 0);
139 return NULL;
140 }
141
142 /*
143 * Find out if this inode's data blocks are discontinuous; if they are,
144 * rewrite them using markv. Return the number of inodes rewritten.
145 */
146 static int
147 clean_inode(struct clfs *fs, ino_t ino)
148 {
149 BLOCK_INFO *bip = NULL, *tbip;
150 CLEANERINFO cip;
151 struct ubuf *bp;
152 struct ulfs1_dinode *dip;
153 struct clfs_seguse *sup;
154 struct lfs_fcntl_markv /* {
155 BLOCK_INFO *blkiov;
156 int blkcnt;
157 } */ lim;
158 daddr_t toff;
159 int noff;
160 blkcnt_t i, nb, onb;
161 int retval;
162 int bps;
163
164 dip = get_dinode(fs, ino);
165 if (dip == NULL)
166 return COALESCE_NOINODE;
167
168 /* Compute file block size, set up for bmapv */
169 onb = nb = lfs_lblkno(fs, dip->di_size);
170
171 /* XXX for now, don't do any file small enough to have fragments */
172 if (nb < ULFS_NDADDR) {
173 free(dip);
174 return COALESCE_TOOSMALL;
175 }
176
177 /* Sanity checks */
178 #if 0 /* di_size is uint64_t -- this is a noop */
179 if (dip->di_size < 0) {
180 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
181 free(dip);
182 return COALESCE_BADSIZE;
183 }
184 #endif
185 if (nb > dip->di_blocks) {
186 dlog("ino %ju, computed blocks %jd > held blocks %ju",
187 (uintmax_t)ino, (intmax_t)nb,
188 (uintmax_t)dip->di_blocks);
189 free(dip);
190 return COALESCE_BADBLOCKSIZE;
191 }
192
193 /*
194 * XXX: We should really coalesce really large files in
195 * chunks, as there's substantial diminishing returns and
196 * mallocing huge amounts of memory just to get those returns
197 * is pretty silly. But that requires a big rework of this
198 * code. (On the plus side though then we can stop worrying
199 * about block counts > 2^31.)
200 */
201
202 /* ugh, no DADDR_T_MAX */
203 __CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
204 if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
205 syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
206 (uintmax_t)ino, (uintmax_t)nb);
207 free(dip);
208 return COALESCE_NOMEM;
209 }
210
211 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
212 if (bip == NULL) {
213 syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
214 (unsigned long long)ino, (intmax_t)nb,
215 strerror(errno));
216 free(dip);
217 return COALESCE_NOMEM;
218 }
219 for (i = 0; i < nb; i++) {
220 memset(bip + i, 0, sizeof(BLOCK_INFO));
221 bip[i].bi_inode = ino;
222 bip[i].bi_lbn = i;
223 bip[i].bi_version = dip->di_gen;
224 /* Don't set the size, but let lfs_bmap fill it in */
225 }
226 /*
227 * The kernel also contains this check; but as lim.blkcnt is
228 * only 32 bits wide, we need to check ourselves too in case
229 * we'd otherwise truncate a value > 2^31, as that might
230 * succeed and create bizarre results.
231 */
232 if (nb > LFS_MARKV_MAXBLKCNT) {
233 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
234 lfs_sb_getfsmnt(fs));
235 retval = COALESCE_BADBMAPV;
236 goto out;
237 }
238 lim.blkiov = bip;
239 lim.blkcnt = nb;
240 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
241 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
242 lfs_sb_getfsmnt(fs));
243 retval = COALESCE_BADBMAPV;
244 goto out;
245 }
246 #if 0
247 for (i = 0; i < nb; i++) {
248 printf("bi_size = %d, bi_ino = %ju, "
249 "bi_lbn = %jd, bi_daddr = %jd\n",
250 bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
251 (intmax_t)bip[i].bi_lbn,
252 (intmax_t)bip[i].bi_daddr);
253 }
254 #endif
255 noff = 0;
256 toff = 0;
257 for (i = 1; i < nb; i++) {
258 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
259 ++noff;
260 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
261 - lfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
262 }
263
264 /*
265 * If this file is not discontinuous, there's no point in rewriting it.
266 *
267 * Explicitly allow a certain amount of discontinuity, since large
268 * files will be broken among segments and medium-sized files
269 * can have a break or two and it's okay.
270 */
271 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
272 lfs_segtod(fs, noff) * 2 < nb) {
273 retval = COALESCE_NOTWORTHIT;
274 goto out;
275 } else if (debug)
276 syslog(LOG_DEBUG, "ino %llu total discontinuity "
277 "%d (%jd) for %jd blocks", (unsigned long long)ino,
278 noff, (intmax_t)toff, (intmax_t)nb);
279
280 /* Search for blocks in active segments; don't move them. */
281 for (i = 0; i < nb; i++) {
282 if (bip[i].bi_daddr <= 0)
283 continue;
284 sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
285 if (sup->flags & SEGUSE_ACTIVE)
286 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
287 }
288
289 /*
290 * Get rid of any blocks we've marked dead. If this is an older
291 * kernel that doesn't have bmapv fill in the block sizes, we'll
292 * toss everything here.
293 */
294 onb = nb;
295 toss_old_blocks(fs, &bip, &nb, NULL);
296 nb = i;
297
298 /*
299 * We may have tossed enough blocks that it is no longer worthwhile
300 * to rewrite this inode.
301 */
302 if (nb == 0 || onb - nb > log2int(onb)) {
303 if (debug)
304 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
305 retval = COALESCE_NOTHINGLEFT;
306 goto out;
307 }
308
309 /*
310 * We are going to rewrite this inode.
311 * For any remaining blocks, read in their contents.
312 */
313 for (i = 0; i < nb; i++) {
314 bip[i].bi_bp = malloc(bip[i].bi_size);
315 if (bip[i].bi_bp == NULL) {
316 syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
317 bip[i].bi_size, strerror(errno));
318 retval = COALESCE_NOMEM;
319 goto out;
320 }
321
322 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
323 lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
324 retval = COALESCE_EIO;
325 goto out;
326 }
327 }
328 if (debug)
329 syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
330 (uintmax_t)ino, (intmax_t)nb);
331
332 /*
333 * Write in segment-sized chunks. If at any point we'd write more
334 * than half of the available segments, sleep until that's not
335 * true any more.
336 *
337 * XXX the pointer arithmetic in this loop is illegal; replace
338 * TBIP with an integer (blkcnt_t) offset.
339 */
340 bps = lfs_segtod(fs, 1);
341 for (tbip = bip; tbip < bip + nb; tbip += bps) {
342 do {
343 bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp);
344 cip = *(CLEANERINFO *)bp->b_data;
345 brelse(bp, B_INVAL);
346
347 if (cip.clean < 4) /* XXX magic number 4 */
348 kops.ko_fcntl(fs->clfs_ifilefd,
349 LFCNSEGWAIT, NULL);
350 } while(cip.clean < 4);
351
352 /*
353 * Note that although lim.blkcnt is 32 bits wide, bps
354 * (which is blocks-per-segment) is < 2^32 so the
355 * value assigned here is always in range.
356 */
357 lim.blkiov = tbip;
358 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
359 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
360 retval = COALESCE_BADMARKV;
361 goto out;
362 }
363 }
364
365 retval = COALESCE_OK;
366 out:
367 free(dip);
368 if (bip) {
369 for (i = 0; i < onb; i++)
370 if (bip[i].bi_bp)
371 free(bip[i].bi_bp);
372 free(bip);
373 }
374 return retval;
375 }
376
377 /*
378 * Try coalescing every inode in the filesystem.
379 * Return the number of inodes actually altered.
380 */
381 int clean_all_inodes(struct clfs *fs)
382 {
383 int i, r, maxino;
384 int totals[COALESCE_MAXERROR];
385 struct stat st;
386
387 memset(totals, 0, sizeof(totals));
388
389 fstat(fs->clfs_ifilefd, &st);
390 maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
391 lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
392
393 for (i = 0; i < maxino; i++) {
394 r = clean_inode(fs, i);
395 ++totals[r];
396 }
397
398 for (i = 0; i < COALESCE_MAXERROR; i++)
399 if (totals[i])
400 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
401 totals[i]);
402
403 return totals[COALESCE_OK];
404 }
405
406 /*
407 * Fork a child process to coalesce this fs.
408 */
409 int
410 fork_coalesce(struct clfs *fs)
411 {
412 static pid_t childpid;
413 int num;
414
415 /*
416 * If already running a coalescing child, don't start a new one.
417 */
418 if (childpid) {
419 if (waitpid(childpid, NULL, WNOHANG) == childpid)
420 childpid = 0;
421 }
422 if (childpid && kill(childpid, 0) >= 0) {
423 /* already running a coalesce process */
424 if (debug)
425 syslog(LOG_DEBUG, "coalescing already in progress");
426 return 0;
427 }
428
429 /*
430 * Fork a child and let the child coalease
431 */
432 childpid = fork();
433 if (childpid < 0) {
434 syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
435 return 0;
436 } else if (childpid == 0) {
437 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
438 lfs_sb_getfsmnt(fs), getpid());
439 num = clean_all_inodes(fs);
440 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
441 lfs_sb_getfsmnt(fs), num);
442 exit(0);
443 }
444
445 return 0;
446 }
447