coalesce.c revision 1.27 1 /* $NetBSD: coalesce.c,v 1.27 2015/07/28 05:09:34 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39
40 #include <ufs/lfs/lfs.h>
41 #include <ufs/lfs/lfs_accessors.h>
42
43 #include <fcntl.h>
44 #include <signal.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <time.h>
49 #include <unistd.h>
50 #include <util.h>
51 #include <errno.h>
52 #include <err.h>
53 #include <assert.h>
54
55 #include <syslog.h>
56
57 #include "bufcache.h"
58 #include "vnode.h"
59 #include "cleaner.h"
60 #include "kernelops.h"
61
62 extern int debug, do_mmap;
63
/*
 * Integer base-2 logarithm: returns floor(log2(n)) for n > 0,
 * and -1 when n is zero or negative.
 */
int log2int(int n)
{
	int bits = -1;

	/* Each pass strips one bit; the count of passes minus one is the log. */
	for (; n > 0; n >>= 1)
		bits++;

	return bits;
}
75
/*
 * Result codes produced when attempting to coalesce a single inode.
 * COALESCE_MAXERROR is a sentinel one past the last real code; it is
 * used to size per-code tables.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each coalesce_returncodes value, indexed
 * by the code.  Designated initializers tie every string to its code so
 * the table cannot silently drift out of step with the enum.
 */
const char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
107
108 static struct ulfs1_dinode *
109 get_dinode(struct clfs *fs, ino_t ino)
110 {
111 IFILE *ifp;
112 daddr_t daddr;
113 struct ubuf *bp;
114 struct ulfs1_dinode *dip, *r;
115
116 lfs_ientry(&ifp, fs, ino, &bp);
117 daddr = ifp->if_daddr;
118 brelse(bp, 0);
119
120 if (daddr == 0x0)
121 return NULL;
122
123 bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
124 for (dip = (struct ulfs1_dinode *)bp->b_data;
125 dip < (struct ulfs1_dinode *)(bp->b_data + lfs_sb_getibsize(fs)); dip++)
126 if (dip->di_inumber == ino) {
127 r = (struct ulfs1_dinode *)malloc(sizeof(*r));
128 if (r == NULL)
129 break;
130 memcpy(r, dip, sizeof(*r));
131 brelse(bp, 0);
132 return r;
133 }
134 brelse(bp, 0);
135 return NULL;
136 }
137
138 /*
139 * Find out if this inode's data blocks are discontinuous; if they are,
140 * rewrite them using markv. Return the number of inodes rewritten.
141 */
142 static int
143 clean_inode(struct clfs *fs, ino_t ino)
144 {
145 BLOCK_INFO *bip = NULL, *tbip;
146 CLEANERINFO cip;
147 struct ubuf *bp;
148 struct ulfs1_dinode *dip;
149 struct clfs_seguse *sup;
150 struct lfs_fcntl_markv /* {
151 BLOCK_INFO *blkiov;
152 int blkcnt;
153 } */ lim;
154 daddr_t toff;
155 int i;
156 int nb, onb, noff;
157 int retval;
158 int bps;
159
160 dip = get_dinode(fs, ino);
161 if (dip == NULL)
162 return COALESCE_NOINODE;
163
164 /* Compute file block size, set up for bmapv */
165 onb = nb = lfs_lblkno(fs, dip->di_size);
166
167 /* XXX for now, don't do any file small enough to have fragments */
168 if (nb < ULFS_NDADDR) {
169 free(dip);
170 return COALESCE_TOOSMALL;
171 }
172
173 /* Sanity checks */
174 #if 0 /* di_size is uint64_t -- this is a noop */
175 if (dip->di_size < 0) {
176 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
177 free(dip);
178 return COALESCE_BADSIZE;
179 }
180 #endif
181 if (nb > dip->di_blocks) {
182 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
183 dip->di_blocks);
184 free(dip);
185 return COALESCE_BADBLOCKSIZE;
186 }
187
188 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
189 if (bip == NULL) {
190 syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
191 (unsigned long long)ino, nb);
192 free(dip);
193 return COALESCE_NOMEM;
194 }
195 for (i = 0; i < nb; i++) {
196 memset(bip + i, 0, sizeof(BLOCK_INFO));
197 bip[i].bi_inode = ino;
198 bip[i].bi_lbn = i;
199 bip[i].bi_version = dip->di_gen;
200 /* Don't set the size, but let lfs_bmap fill it in */
201 }
202 lim.blkiov = bip;
203 lim.blkcnt = nb;
204 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
205 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
206 lfs_sb_getfsmnt(fs));
207 retval = COALESCE_BADBMAPV;
208 goto out;
209 }
210 #if 0
211 for (i = 0; i < nb; i++) {
212 printf("bi_size = %d, bi_ino = %d, "
213 "bi_lbn = %d, bi_daddr = %d\n",
214 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
215 bip[i].bi_daddr);
216 }
217 #endif
218 noff = toff = 0;
219 for (i = 1; i < nb; i++) {
220 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + clfs_sb_getfrag(fs))
221 ++noff;
222 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
223 - clfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
224 }
225
226 /*
227 * If this file is not discontinuous, there's no point in rewriting it.
228 *
229 * Explicitly allow a certain amount of discontinuity, since large
230 * files will be broken among segments and medium-sized files
231 * can have a break or two and it's okay.
232 */
233 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
234 lfs_segtod(fs, noff) * 2 < nb) {
235 retval = COALESCE_NOTWORTHIT;
236 goto out;
237 } else if (debug)
238 syslog(LOG_DEBUG, "ino %llu total discontinuity "
239 "%d (%lld) for %d blocks", (unsigned long long)ino,
240 noff, (long long)toff, nb);
241
242 /* Search for blocks in active segments; don't move them. */
243 for (i = 0; i < nb; i++) {
244 if (bip[i].bi_daddr <= 0)
245 continue;
246 sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
247 if (sup->flags & SEGUSE_ACTIVE)
248 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
249 }
250
251 /*
252 * Get rid of any blocks we've marked dead. If this is an older
253 * kernel that doesn't have bmapv fill in the block sizes, we'll
254 * toss everything here.
255 */
256 onb = nb;
257 toss_old_blocks(fs, &bip, &nb, NULL);
258 nb = i;
259
260 /*
261 * We may have tossed enough blocks that it is no longer worthwhile
262 * to rewrite this inode.
263 */
264 if (nb == 0 || onb - nb > log2int(onb)) {
265 if (debug)
266 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
267 retval = COALESCE_NOTHINGLEFT;
268 goto out;
269 }
270
271 /*
272 * We are going to rewrite this inode.
273 * For any remaining blocks, read in their contents.
274 */
275 for (i = 0; i < nb; i++) {
276 bip[i].bi_bp = malloc(bip[i].bi_size);
277 if (bip[i].bi_bp == NULL) {
278 syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
279 bip[i].bi_size);
280 retval = COALESCE_NOMEM;
281 goto out;
282 }
283
284 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
285 lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
286 retval = COALESCE_EIO;
287 goto out;
288 }
289 }
290 if (debug)
291 syslog(LOG_DEBUG, "ino %llu markv %d blocks",
292 (unsigned long long)ino, nb);
293
294 /*
295 * Write in segment-sized chunks. If at any point we'd write more
296 * than half of the available segments, sleep until that's not
297 * true any more.
298 */
299 bps = lfs_segtod(fs, 1);
300 for (tbip = bip; tbip < bip + nb; tbip += bps) {
301 do {
302 bread(fs->lfs_ivnode, 0, clfs_sb_getbsize(fs), 0, &bp);
303 cip = *(CLEANERINFO *)bp->b_data;
304 brelse(bp, B_INVAL);
305
306 if (cip.clean < 4) /* XXX magic number 4 */
307 kops.ko_fcntl(fs->clfs_ifilefd,
308 LFCNSEGWAIT, NULL);
309 } while(cip.clean < 4);
310
311 lim.blkiov = tbip;
312 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
313 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
314 retval = COALESCE_BADMARKV;
315 goto out;
316 }
317 }
318
319 retval = COALESCE_OK;
320 out:
321 free(dip);
322 if (bip) {
323 for (i = 0; i < onb; i++)
324 if (bip[i].bi_bp)
325 free(bip[i].bi_bp);
326 free(bip);
327 }
328 return retval;
329 }
330
331 /*
332 * Try coalescing every inode in the filesystem.
333 * Return the number of inodes actually altered.
334 */
335 int clean_all_inodes(struct clfs *fs)
336 {
337 int i, r, maxino;
338 int totals[COALESCE_MAXERROR];
339 struct stat st;
340
341 memset(totals, 0, sizeof(totals));
342
343 fstat(fs->clfs_ifilefd, &st);
344 maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
345 lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
346
347 for (i = 0; i < maxino; i++) {
348 r = clean_inode(fs, i);
349 ++totals[r];
350 }
351
352 for (i = 0; i < COALESCE_MAXERROR; i++)
353 if (totals[i])
354 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
355 totals[i]);
356
357 return totals[COALESCE_OK];
358 }
359
360 /*
361 * Fork a child process to coalesce this fs.
362 */
363 int
364 fork_coalesce(struct clfs *fs)
365 {
366 static pid_t childpid;
367 int num;
368
369 /*
370 * If already running a coalescing child, don't start a new one.
371 */
372 if (childpid) {
373 if (waitpid(childpid, NULL, WNOHANG) == childpid)
374 childpid = 0;
375 }
376 if (childpid && kill(childpid, 0) >= 0) {
377 /* already running a coalesce process */
378 if (debug)
379 syslog(LOG_DEBUG, "coalescing already in progress");
380 return 0;
381 }
382
383 /*
384 * Fork a child and let the child coalease
385 */
386 childpid = fork();
387 if (childpid < 0) {
388 syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
389 return 0;
390 } else if (childpid == 0) {
391 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
392 lfs_sb_getfsmnt(fs), getpid());
393 num = clean_all_inodes(fs);
394 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
395 lfs_sb_getfsmnt(fs), num);
396 exit(0);
397 }
398
399 return 0;
400 }
401