coalesce.c revision 1.14 1 /* $NetBSD: coalesce.c,v 1.14 2007/10/08 21:41:12 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 #include <sys/param.h>
40 #include <sys/mount.h>
41 #include <sys/time.h>
42 #include <sys/resource.h>
43 #include <sys/types.h>
44 #include <sys/wait.h>
45 #include <sys/mman.h>
46
47 #include <ufs/ufs/dinode.h>
48 #include <ufs/lfs/lfs.h>
49
50 #include <fcntl.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <time.h>
56 #include <unistd.h>
57 #include <util.h>
58 #include <errno.h>
59 #include <err.h>
60
61 #include <syslog.h>
62
63 #include "bufcache.h"
64 #include "vnode.h"
65 #include "cleaner.h"
66
67 extern int debug, do_mmap;
68
/*
 * Integer base-2 logarithm, rounded down: the position of the highest
 * set bit of n.  Returns -1 when n is zero or negative.
 */
int
log2int(int n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
80
/*
 * Result codes returned by clean_inode().  The values are used as indices
 * into coalesce_return[] and into the per-code totals kept by
 * clean_all_inodes(), so COALESCE_MAXERROR must remain the last entry.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code, indexed by
 * enum coalesce_returncodes.  Designated initializers tie every string
 * to its enumerator so that the table cannot drift out of step with the
 * enum when a code is added or reordered.
 */
char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks not found or in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
112
113 static struct ufs1_dinode *
114 get_dinode(struct clfs *fs, ino_t ino)
115 {
116 IFILE *ifp;
117 daddr_t daddr;
118 struct ubuf *bp;
119 struct ufs1_dinode *dip, *r;
120
121 lfs_ientry(&ifp, fs, ino, &bp);
122 daddr = ifp->if_daddr;
123 brelse(bp, 0);
124
125 if (daddr == 0x0)
126 return NULL;
127
128 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp);
129 for (dip = (struct ufs1_dinode *)bp->b_data;
130 dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
131 if (dip->di_inumber == ino) {
132 r = (struct ufs1_dinode *)malloc(sizeof(*r));
133 memcpy(r, dip, sizeof(*r));
134 brelse(bp, 0);
135 return r;
136 }
137 brelse(bp, 0);
138 return NULL;
139 }
140
141 /*
142 * Find out if this inode's data blocks are discontinuous; if they are,
143 * rewrite them using markv. Return the number of inodes rewritten.
144 */
145 static int
146 clean_inode(struct clfs *fs, ino_t ino)
147 {
148 BLOCK_INFO *bip = NULL, *tbip;
149 CLEANERINFO cip;
150 struct ubuf *bp;
151 struct ufs1_dinode *dip;
152 struct clfs_seguse *sup;
153 struct lfs_fcntl_markv /* {
154 BLOCK_INFO *blkiov;
155 int blkcnt;
156 } */ lim;
157 daddr_t toff;
158 int i;
159 int nb, onb, noff;
160 int retval;
161 int bps;
162
163 dip = get_dinode(fs, ino);
164 if (dip == NULL)
165 return COALESCE_NOINODE;
166
167 /* Compute file block size, set up for bmapv */
168 onb = nb = lblkno(fs, dip->di_size);
169
170 /* XXX for now, don't do any file small enough to have fragments */
171 if (nb < NDADDR) {
172 free(dip);
173 return COALESCE_TOOSMALL;
174 }
175
176 /* Sanity checks */
177 if (dip->di_size < 0) {
178 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
179 free(dip);
180 return COALESCE_BADSIZE;
181 }
182 if (nb > dip->di_blocks) {
183 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
184 dip->di_blocks);
185 free(dip);
186 return COALESCE_BADBLOCKSIZE;
187 }
188
189 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
190 if (bip == NULL) {
191 syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
192 (unsigned long long)ino, nb);
193 free(dip);
194 return COALESCE_NOMEM;
195 }
196 for (i = 0; i < nb; i++) {
197 memset(bip + i, 0, sizeof(BLOCK_INFO));
198 bip[i].bi_inode = ino;
199 bip[i].bi_lbn = i;
200 bip[i].bi_version = dip->di_gen;
201 /* Don't set the size, but let lfs_bmap fill it in */
202 }
203 lim.blkiov = bip;
204 lim.blkcnt = nb;
205 if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
206 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
207 fs->lfs_fsmnt);
208 retval = COALESCE_BADBMAPV;
209 goto out;
210 }
211 #if 0
212 for (i = 0; i < nb; i++) {
213 printf("bi_size = %d, bi_ino = %d, "
214 "bi_lbn = %d, bi_daddr = %d\n",
215 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
216 bip[i].bi_daddr);
217 }
218 #endif
219 noff = toff = 0;
220 for (i = 1; i < nb; i++) {
221 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
222 ++noff;
223 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
224 - fs->lfs_frag) >> fs->lfs_fbshift;
225 }
226
227 /*
228 * If this file is not discontinuous, there's no point in rewriting it.
229 *
230 * Explicitly allow a certain amount of discontinuity, since large
231 * files will be broken among segments and medium-sized files
232 * can have a break or two and it's okay.
233 */
234 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
235 segtod(fs, noff) * 2 < nb) {
236 retval = COALESCE_NOTWORTHIT;
237 goto out;
238 } else if (debug)
239 syslog(LOG_DEBUG, "ino %llu total discontinuity "
240 "%d (%lld) for %d blocks", (unsigned long long)ino,
241 noff, (long long)toff, nb);
242
243 /* Search for blocks in active segments; don't move them. */
244 for (i = 0; i < nb; i++) {
245 if (bip[i].bi_daddr <= 0)
246 continue;
247 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
248 if (sup->flags & SEGUSE_ACTIVE)
249 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
250 }
251
252 /*
253 * Get rid of any blocks we've marked dead. If this is an older
254 * kernel that doesn't have bmapv fill in the block sizes, we'll
255 * toss everything here.
256 */
257 onb = nb;
258 toss_old_blocks(fs, &bip, &nb, NULL);
259 nb = i;
260
261 /*
262 * We may have tossed enough blocks that it is no longer worthwhile
263 * to rewrite this inode.
264 */
265 if (nb == 0 || onb - nb > log2int(onb)) {
266 if (debug)
267 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
268 retval = COALESCE_NOTHINGLEFT;
269 goto out;
270 }
271
272 /*
273 * We are going to rewrite this inode.
274 * For any remaining blocks, read in their contents.
275 */
276 for (i = 0; i < nb; i++) {
277 bip[i].bi_bp = malloc(bip[i].bi_size);
278 if (bip[i].bi_bp == NULL) {
279 syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
280 bip[i].bi_size);
281 retval = COALESCE_NOMEM;
282 goto out;
283 }
284
285 if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
286 fsbtob(fs, bip[i].bi_daddr)) < 0) {
287 retval = COALESCE_EIO;
288 goto out;
289 }
290 }
291 if (debug)
292 syslog(LOG_DEBUG, "ino %llu markv %d blocks",
293 (unsigned long long)ino, nb);
294
295 /*
296 * Write in segment-sized chunks. If at any point we'd write more
297 * than half of the available segments, sleep until that's not
298 * true any more.
299 */
300 bps = segtod(fs, 1);
301 for (tbip = bip; tbip < bip + nb; tbip += bps) {
302 do {
303 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp);
304 cip = *(CLEANERINFO *)bp->b_data;
305 brelse(bp, B_INVAL);
306
307 if (cip.clean < 4) /* XXX magic number 4 */
308 fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
309 } while(cip.clean < 4);
310
311 lim.blkiov = tbip;
312 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
313 if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
314 retval = COALESCE_BADMARKV;
315 goto out;
316 }
317 }
318
319 retval = COALESCE_OK;
320 out:
321 free(dip);
322 if (bip) {
323 for (i = 0; i < onb; i++)
324 if (bip[i].bi_bp)
325 free(bip[i].bi_bp);
326 free(bip);
327 }
328 return retval;
329 }
330
331 /*
332 * Try coalescing every inode in the filesystem.
333 * Return the number of inodes actually altered.
334 */
335 int clean_all_inodes(struct clfs *fs)
336 {
337 int i, r, maxino;
338 int totals[COALESCE_MAXERROR];
339 struct stat st;
340
341 memset(totals, 0, sizeof(totals));
342
343 fstat(fs->clfs_ifilefd, &st);
344 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
345 fs->lfs_segtabsz - fs->lfs_cleansz;
346
347 for (i = 0; i < maxino; i++) {
348 r = clean_inode(fs, i);
349 ++totals[r];
350 }
351
352 for (i = 0; i < COALESCE_MAXERROR; i++)
353 if (totals[i])
354 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
355 totals[i]);
356
357 return totals[COALESCE_OK];
358 }
359
360 /*
361 * Fork a child process to coalesce this fs.
362 */
363 int
364 fork_coalesce(struct clfs *fs)
365 {
366 static pid_t childpid;
367 int num;
368
369 /*
370 * If already running a coalescing child, don't start a new one.
371 */
372 if (childpid) {
373 if (waitpid(childpid, NULL, WNOHANG) == childpid)
374 childpid = 0;
375 }
376 if (childpid && kill(childpid, 0) >= 0) {
377 /* already running a coalesce process */
378 if (debug)
379 syslog(LOG_DEBUG, "coalescing already in progress");
380 return 0;
381 }
382
383 /*
384 * Fork a child and let the child coalease
385 */
386 childpid = fork();
387 if (childpid < 0) {
388 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
389 return 0;
390 } else if (childpid == 0) {
391 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
392 fs->lfs_fsmnt, getpid());
393 num = clean_all_inodes(fs);
394 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
395 fs->lfs_fsmnt, num);
396 exit(0);
397 }
398
399 return 0;
400 }
401