coalesce.c revision 1.11 1 /* $NetBSD: coalesce.c,v 1.11 2006/03/30 19:10:13 perseant Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 #include <sys/param.h>
40 #include <sys/mount.h>
41 #include <sys/time.h>
42 #include <sys/resource.h>
43 #include <sys/types.h>
44 #include <sys/wait.h>
45 #include <sys/mman.h>
46
47 #include <ufs/ufs/dinode.h>
48 #include <ufs/lfs/lfs.h>
49
50 #include <fcntl.h>
51 #include <signal.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <time.h>
56 #include <unistd.h>
57 #include <util.h>
58 #include <errno.h>
59 #include <err.h>
60
61 #include <syslog.h>
62
63 #include "bufcache.h"
64 #include "vnode.h"
65 #include "cleaner.h"
66
67 extern int debug, do_mmap;
68
/*
 * Integer base-2 logarithm, rounded down: the zero-based position of the
 * highest set bit of n.  Yields -1 when n is zero or negative.
 */
int log2int(int n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
80
/*
 * Result codes produced by clean_inode().  clean_all_inodes() uses each
 * value as an index into its totals[] tally and into the coalesce_return[]
 * message table, so the values must stay dense and start at zero.
 */
enum coalesce_returncodes {
	COALESCE_OK		= 0,	/* blocks were handed to LFCNMARKV */
	COALESCE_NOINODE	= 1,	/* get_dinode() found no inode */
	COALESCE_TOOSMALL	= 2,	/* fewer than NDADDR blocks */
	COALESCE_BADSIZE	= 3,	/* di_size is negative */
	COALESCE_BADBLOCKSIZE	= 4,	/* computed blocks exceed di_blocks */
	COALESCE_NOMEM		= 5,	/* malloc() failed */
	COALESCE_BADBMAPV	= 6,	/* LFCNBMAPV fcntl failed */
	COALESCE_BADMARKV	= 7,	/* LFCNMARKV fcntl failed */
	COALESCE_NOTWORTHIT	= 8,	/* file is not discontinuous enough */
	COALESCE_NOTHINGLEFT	= 9,	/* too many blocks were tossed */
	COALESCE_EIO		= 10,	/* pread() of a block failed */

	COALESCE_MAXERROR	= 11	/* number of codes; not a result */
};
96
/*
 * Human-readable descriptions of the coalesce_returncodes values, listed
 * in enum order so that coalesce_return[code] describes `code`.
 *
 * The table previously had no entry for COALESCE_BADMARKV, which shifted
 * the messages for the codes after COALESCE_BADBMAPV by one.  Keep exactly
 * one string per enumerator (COALESCE_MAXERROR included) when adding codes.
 */
char *coalesce_return[] = {
	"Successfully coalesced",		/* COALESCE_OK */
	"File not in use or inode not found",	/* COALESCE_NOINODE */
	"Not large enough to coalesce",		/* COALESCE_TOOSMALL */
	"Negative size",			/* COALESCE_BADSIZE */
	"Not enough blocks to account for size", /* COALESCE_BADBLOCKSIZE */
	"Malloc failed",			/* COALESCE_NOMEM */
	"LFCNBMAPV failed",			/* COALESCE_BADBMAPV */
	"LFCNMARKV failed",			/* COALESCE_BADMARKV */
	"Not broken enough to fix",		/* COALESCE_NOTWORTHIT */
	"Too many blocks not found or in active segments",
						/* COALESCE_NOTHINGLEFT */
	"I/O error",				/* COALESCE_EIO */

	"No such error"				/* COALESCE_MAXERROR */
};
112
113 static struct ufs1_dinode *
114 get_dinode(struct clfs *fs, ino_t ino)
115 {
116 IFILE *ifp;
117 daddr_t daddr;
118 struct ubuf *bp;
119 struct ufs1_dinode *dip, *r;
120
121 lfs_ientry(&ifp, fs, ino, &bp);
122 daddr = ifp->if_daddr;
123 brelse(bp);
124
125 if (daddr == 0x0)
126 return NULL;
127
128 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp);
129 for (dip = (struct ufs1_dinode *)bp->b_data;
130 dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
131 if (dip->di_inumber == ino) {
132 r = (struct ufs1_dinode *)malloc(sizeof(*r));
133 memcpy(r, dip, sizeof(*r));
134 brelse(bp);
135 return r;
136 }
137 brelse(bp);
138 return NULL;
139 }
140
141 /*
142 * Find out if this inode's data blocks are discontinuous; if they are,
143 * rewrite them using markv. Return the number of inodes rewritten.
144 */
145 static int
146 clean_inode(struct clfs *fs, ino_t ino)
147 {
148 BLOCK_INFO *bip = NULL, *tbip;
149 CLEANERINFO cip;
150 struct ubuf *bp;
151 struct ufs1_dinode *dip;
152 struct clfs_seguse *sup;
153 struct lfs_fcntl_markv /* {
154 BLOCK_INFO *blkiov;
155 int blkcnt;
156 } */ lim;
157 daddr_t toff;
158 int i;
159 int nb, onb, noff;
160 int retval;
161 int bps;
162
163 dip = get_dinode(fs, ino);
164 if (dip == NULL)
165 return COALESCE_NOINODE;
166
167 /* Compute file block size, set up for bmapv */
168 onb = nb = lblkno(fs, dip->di_size);
169
170 /* XXX for now, don't do any file small enough to have fragments */
171 if (nb < NDADDR)
172 return COALESCE_TOOSMALL;
173
174 /* Sanity checks */
175 if (dip->di_size < 0) {
176 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
177 return COALESCE_BADSIZE;
178 }
179 if (nb > dip->di_blocks) {
180 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
181 dip->di_blocks);
182 return COALESCE_BADBLOCKSIZE;
183 }
184
185 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
186 if (bip == NULL) {
187 syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
188 (unsigned long long)ino, nb);
189 return COALESCE_NOMEM;
190 }
191 for (i = 0; i < nb; i++) {
192 memset(bip + i, 0, sizeof(BLOCK_INFO));
193 bip[i].bi_inode = ino;
194 bip[i].bi_lbn = i;
195 bip[i].bi_version = dip->di_gen;
196 /* Don't set the size, but let lfs_bmap fill it in */
197 }
198 lim.blkiov = bip;
199 lim.blkcnt = nb;
200 if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
201 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
202 fs->lfs_fsmnt);
203 retval = COALESCE_BADBMAPV;
204 goto out;
205 }
206 #if 0
207 for (i = 0; i < nb; i++) {
208 printf("bi_size = %d, bi_ino = %d, "
209 "bi_lbn = %d, bi_daddr = %d\n",
210 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
211 bip[i].bi_daddr);
212 }
213 #endif
214 noff = toff = 0;
215 for (i = 1; i < nb; i++) {
216 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
217 ++noff;
218 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
219 - fs->lfs_frag) >> fs->lfs_fbshift;
220 }
221
222 /*
223 * If this file is not discontinuous, there's no point in rewriting it.
224 *
225 * Explicitly allow a certain amount of discontinuity, since large
226 * files will be broken among segments and medium-sized files
227 * can have a break or two and it's okay.
228 */
229 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
230 segtod(fs, noff) * 2 < nb) {
231 retval = COALESCE_NOTWORTHIT;
232 goto out;
233 } else if (debug)
234 syslog(LOG_DEBUG, "ino %llu total discontinuity "
235 "%d (%lld) for %d blocks", (unsigned long long)ino,
236 noff, (long long)toff, nb);
237
238 /* Search for blocks in active segments; don't move them. */
239 for (i = 0; i < nb; i++) {
240 if (bip[i].bi_daddr <= 0)
241 continue;
242 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
243 if (sup->flags & SEGUSE_ACTIVE)
244 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
245 }
246
247 /*
248 * Get rid of any blocks we've marked dead. If this is an older
249 * kernel that doesn't have bmapv fill in the block sizes, we'll
250 * toss everything here.
251 */
252 onb = nb;
253 toss_old_blocks(fs, &bip, &nb);
254 nb = i;
255
256 /*
257 * We may have tossed enough blocks that it is no longer worthwhile
258 * to rewrite this inode.
259 */
260 if (nb == 0 || onb - nb > log2int(onb)) {
261 if (debug)
262 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
263 retval = COALESCE_NOTHINGLEFT;
264 goto out;
265 }
266
267 /*
268 * We are going to rewrite this inode.
269 * For any remaining blocks, read in their contents.
270 */
271 for (i = 0; i < nb; i++) {
272 bip[i].bi_bp = malloc(bip[i].bi_size);
273 if (bip[i].bi_bp == NULL) {
274 syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
275 bip[i].bi_size);
276 retval = COALESCE_NOMEM;
277 goto out;
278 }
279
280 if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
281 fsbtob(fs, bip[i].bi_daddr)) < 0) {
282 retval = COALESCE_EIO;
283 goto out;
284 }
285 }
286 if (debug)
287 syslog(LOG_DEBUG, "ino %llu markv %d blocks",
288 (unsigned long long)ino, nb);
289
290 /*
291 * Write in segment-sized chunks. If at any point we'd write more
292 * than half of the available segments, sleep until that's not
293 * true any more.
294 */
295 bps = segtod(fs, 1);
296 for (tbip = bip; tbip < bip + nb; tbip += bps) {
297 do {
298 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp);
299 cip = *(CLEANERINFO *)bp->b_data;
300 bp->b_flags |= B_INVAL;
301 brelse(bp);
302
303 if (cip.clean < 4) /* XXX magic number 4 */
304 fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
305 } while(cip.clean < 4);
306
307 lim.blkiov = tbip;
308 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
309 if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
310 retval = COALESCE_BADMARKV;
311 goto out;
312 }
313 }
314
315 retval = COALESCE_OK;
316 out:
317 free(dip);
318 if (bip) {
319 for (i = 0; i < onb; i++)
320 if (bip[i].bi_bp)
321 free(bip[i].bi_bp);
322 free(bip);
323 }
324 return retval;
325 }
326
327 /*
328 * Try coalescing every inode in the filesystem.
329 * Return the number of inodes actually altered.
330 */
331 int clean_all_inodes(struct clfs *fs)
332 {
333 int i, r, maxino;
334 int totals[COALESCE_MAXERROR];
335 struct stat st;
336
337 memset(totals, 0, sizeof(totals));
338
339 fstat(fs->clfs_ifilefd, &st);
340 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
341 fs->lfs_segtabsz - fs->lfs_cleansz;
342
343 for (i = 0; i < maxino; i++) {
344 r = clean_inode(fs, i);
345 ++totals[r];
346 }
347
348 for (i = 0; i < COALESCE_MAXERROR; i++)
349 if (totals[i])
350 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
351 totals[i]);
352
353 return totals[COALESCE_OK];
354 }
355
356 /*
357 * Fork a child process to coalesce this fs.
358 */
359 int
360 fork_coalesce(struct clfs *fs)
361 {
362 static pid_t childpid;
363 int num;
364
365 /*
366 * If already running a coalescing child, don't start a new one.
367 */
368 if (childpid) {
369 if (waitpid(childpid, NULL, WNOHANG) == childpid)
370 childpid = 0;
371 }
372 if (childpid && kill(childpid, 0) >= 0) {
373 /* already running a coalesce process */
374 if (debug)
375 syslog(LOG_DEBUG, "coalescing already in progress");
376 return 0;
377 }
378
379 /*
380 * Fork a child and let the child coalease
381 */
382 childpid = fork();
383 if (childpid < 0) {
384 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
385 return 0;
386 } else if (childpid == 0) {
387 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
388 fs->lfs_fsmnt, getpid());
389 num = clean_all_inodes(fs);
390 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
391 fs->lfs_fsmnt, num);
392 exit(0);
393 }
394
395 return 0;
396 }
397