coalesce.c revision 1.19 1 /* $NetBSD: coalesce.c,v 1.19 2012/01/02 21:35:17 perseant Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39
40 #include <ufs/ufs/dinode.h>
41 #include <ufs/lfs/lfs.h>
42
43 #include <fcntl.h>
44 #include <signal.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <time.h>
49 #include <unistd.h>
50 #include <util.h>
51 #include <errno.h>
52 #include <err.h>
53
54 #include <syslog.h>
55
56 #include "bufcache.h"
57 #include "vnode.h"
58 #include "cleaner.h"
59 #include "kernelops.h"
60
61 extern int debug, do_mmap;
62
/*
 * Integer base-2 logarithm of n, rounded down.
 * Any n that is zero or negative yields -1.
 */
int log2int(int n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
74
/*
 * Result codes returned by clean_inode().  COALESCE_MAXERROR is not a real
 * result; it is the number of codes and sizes the per-code totals array.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code, indexed by the code.
 *
 * Designated initializers tie every string to its enumerator, so the table
 * cannot drift out of step with the enum again (previously there was no
 * string for COALESCE_BADMARKV, which shifted the descriptions of the
 * codes that follow it).
 */
const char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks not found or in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
106
107 static struct ufs1_dinode *
108 get_dinode(struct clfs *fs, ino_t ino)
109 {
110 IFILE *ifp;
111 daddr_t daddr;
112 struct ubuf *bp;
113 struct ufs1_dinode *dip, *r;
114
115 lfs_ientry(&ifp, fs, ino, &bp);
116 daddr = ifp->if_daddr;
117 brelse(bp, 0);
118
119 if (daddr == 0x0)
120 return NULL;
121
122 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
123 for (dip = (struct ufs1_dinode *)bp->b_data;
124 dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
125 if (dip->di_inumber == ino) {
126 r = (struct ufs1_dinode *)malloc(sizeof(*r));
127 if (r == NULL)
128 break;
129 memcpy(r, dip, sizeof(*r));
130 brelse(bp, 0);
131 return r;
132 }
133 brelse(bp, 0);
134 return NULL;
135 }
136
137 /*
138 * Find out if this inode's data blocks are discontinuous; if they are,
139 * rewrite them using markv. Return the number of inodes rewritten.
140 */
141 static int
142 clean_inode(struct clfs *fs, ino_t ino)
143 {
144 BLOCK_INFO *bip = NULL, *tbip;
145 CLEANERINFO cip;
146 struct ubuf *bp;
147 struct ufs1_dinode *dip;
148 struct clfs_seguse *sup;
149 struct lfs_fcntl_markv /* {
150 BLOCK_INFO *blkiov;
151 int blkcnt;
152 } */ lim;
153 daddr_t toff;
154 int i;
155 int nb, onb, noff;
156 int retval;
157 int bps;
158
159 dip = get_dinode(fs, ino);
160 if (dip == NULL)
161 return COALESCE_NOINODE;
162
163 /* Compute file block size, set up for bmapv */
164 onb = nb = lblkno(fs, dip->di_size);
165
166 /* XXX for now, don't do any file small enough to have fragments */
167 if (nb < NDADDR) {
168 free(dip);
169 return COALESCE_TOOSMALL;
170 }
171
172 /* Sanity checks */
173 #if 0 /* di_size is uint64_t -- this is a noop */
174 if (dip->di_size < 0) {
175 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
176 free(dip);
177 return COALESCE_BADSIZE;
178 }
179 #endif
180 if (nb > dip->di_blocks) {
181 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
182 dip->di_blocks);
183 free(dip);
184 return COALESCE_BADBLOCKSIZE;
185 }
186
187 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
188 if (bip == NULL) {
189 syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
190 (unsigned long long)ino, nb);
191 free(dip);
192 return COALESCE_NOMEM;
193 }
194 for (i = 0; i < nb; i++) {
195 memset(bip + i, 0, sizeof(BLOCK_INFO));
196 bip[i].bi_inode = ino;
197 bip[i].bi_lbn = i;
198 bip[i].bi_version = dip->di_gen;
199 /* Don't set the size, but let lfs_bmap fill it in */
200 }
201 lim.blkiov = bip;
202 lim.blkcnt = nb;
203 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
204 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
205 fs->lfs_fsmnt);
206 retval = COALESCE_BADBMAPV;
207 goto out;
208 }
209 #if 0
210 for (i = 0; i < nb; i++) {
211 printf("bi_size = %d, bi_ino = %d, "
212 "bi_lbn = %d, bi_daddr = %d\n",
213 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
214 bip[i].bi_daddr);
215 }
216 #endif
217 noff = toff = 0;
218 for (i = 1; i < nb; i++) {
219 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
220 ++noff;
221 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
222 - fs->lfs_frag) >> fs->lfs_fbshift;
223 }
224
225 /*
226 * If this file is not discontinuous, there's no point in rewriting it.
227 *
228 * Explicitly allow a certain amount of discontinuity, since large
229 * files will be broken among segments and medium-sized files
230 * can have a break or two and it's okay.
231 */
232 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
233 segtod(fs, noff) * 2 < nb) {
234 retval = COALESCE_NOTWORTHIT;
235 goto out;
236 } else if (debug)
237 syslog(LOG_DEBUG, "ino %llu total discontinuity "
238 "%d (%lld) for %d blocks", (unsigned long long)ino,
239 noff, (long long)toff, nb);
240
241 /* Search for blocks in active segments; don't move them. */
242 for (i = 0; i < nb; i++) {
243 if (bip[i].bi_daddr <= 0)
244 continue;
245 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
246 if (sup->flags & SEGUSE_ACTIVE)
247 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
248 }
249
250 /*
251 * Get rid of any blocks we've marked dead. If this is an older
252 * kernel that doesn't have bmapv fill in the block sizes, we'll
253 * toss everything here.
254 */
255 onb = nb;
256 toss_old_blocks(fs, &bip, &nb, NULL);
257 nb = i;
258
259 /*
260 * We may have tossed enough blocks that it is no longer worthwhile
261 * to rewrite this inode.
262 */
263 if (nb == 0 || onb - nb > log2int(onb)) {
264 if (debug)
265 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
266 retval = COALESCE_NOTHINGLEFT;
267 goto out;
268 }
269
270 /*
271 * We are going to rewrite this inode.
272 * For any remaining blocks, read in their contents.
273 */
274 for (i = 0; i < nb; i++) {
275 bip[i].bi_bp = malloc(bip[i].bi_size);
276 if (bip[i].bi_bp == NULL) {
277 syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
278 bip[i].bi_size);
279 retval = COALESCE_NOMEM;
280 goto out;
281 }
282
283 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
284 fsbtob(fs, bip[i].bi_daddr)) < 0) {
285 retval = COALESCE_EIO;
286 goto out;
287 }
288 }
289 if (debug)
290 syslog(LOG_DEBUG, "ino %llu markv %d blocks",
291 (unsigned long long)ino, nb);
292
293 /*
294 * Write in segment-sized chunks. If at any point we'd write more
295 * than half of the available segments, sleep until that's not
296 * true any more.
297 */
298 bps = segtod(fs, 1);
299 for (tbip = bip; tbip < bip + nb; tbip += bps) {
300 do {
301 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
302 cip = *(CLEANERINFO *)bp->b_data;
303 brelse(bp, B_INVAL);
304
305 if (cip.clean < 4) /* XXX magic number 4 */
306 kops.ko_fcntl(fs->clfs_ifilefd,
307 LFCNSEGWAIT, NULL);
308 } while(cip.clean < 4);
309
310 lim.blkiov = tbip;
311 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
312 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
313 retval = COALESCE_BADMARKV;
314 goto out;
315 }
316 }
317
318 retval = COALESCE_OK;
319 out:
320 free(dip);
321 if (bip) {
322 for (i = 0; i < onb; i++)
323 if (bip[i].bi_bp)
324 free(bip[i].bi_bp);
325 free(bip);
326 }
327 return retval;
328 }
329
330 /*
331 * Try coalescing every inode in the filesystem.
332 * Return the number of inodes actually altered.
333 */
334 int clean_all_inodes(struct clfs *fs)
335 {
336 int i, r, maxino;
337 int totals[COALESCE_MAXERROR];
338 struct stat st;
339
340 memset(totals, 0, sizeof(totals));
341
342 fstat(fs->clfs_ifilefd, &st);
343 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
344 fs->lfs_segtabsz - fs->lfs_cleansz;
345
346 for (i = 0; i < maxino; i++) {
347 r = clean_inode(fs, i);
348 ++totals[r];
349 }
350
351 for (i = 0; i < COALESCE_MAXERROR; i++)
352 if (totals[i])
353 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
354 totals[i]);
355
356 return totals[COALESCE_OK];
357 }
358
359 /*
360 * Fork a child process to coalesce this fs.
361 */
362 int
363 fork_coalesce(struct clfs *fs)
364 {
365 static pid_t childpid;
366 int num;
367
368 /*
369 * If already running a coalescing child, don't start a new one.
370 */
371 if (childpid) {
372 if (waitpid(childpid, NULL, WNOHANG) == childpid)
373 childpid = 0;
374 }
375 if (childpid && kill(childpid, 0) >= 0) {
376 /* already running a coalesce process */
377 if (debug)
378 syslog(LOG_DEBUG, "coalescing already in progress");
379 return 0;
380 }
381
382 /*
383 * Fork a child and let the child coalease
384 */
385 childpid = fork();
386 if (childpid < 0) {
387 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
388 return 0;
389 } else if (childpid == 0) {
390 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
391 fs->lfs_fsmnt, getpid());
392 num = clean_all_inodes(fs);
393 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
394 fs->lfs_fsmnt, num);
395 exit(0);
396 }
397
398 return 0;
399 }
400