coalesce.c revision 1.22 1 /* $NetBSD: coalesce.c,v 1.22 2013/06/08 21:15:30 dholland Exp $ */
2
3 /*-
4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/param.h>
33 #include <sys/mount.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <sys/types.h>
37 #include <sys/wait.h>
38 #include <sys/mman.h>
39
40 #include <ufs/lfs/lfs.h>
41
42 #include <fcntl.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <util.h>
50 #include <errno.h>
51 #include <err.h>
52
53 #include <syslog.h>
54
55 #include "bufcache.h"
56 #include "vnode.h"
57 #include "cleaner.h"
58 #include "kernelops.h"
59
/*
 * Flags defined outside this file.  `debug' gates the LOG_DEBUG syslog
 * messages below; `do_mmap' is not referenced in the code visible here.
 */
extern int debug, do_mmap;
61
/*
 * Integer base-2 logarithm: the index of the highest set bit of n.
 * Returns -1 when n is zero or negative.
 */
int
log2int(int n)
{
	int highbit;

	/* Each right shift that still leaves something moves the index up. */
	for (highbit = -1; n > 0; n >>= 1)
		highbit++;

	return highbit;
}
73
/*
 * Result codes returned by clean_inode().  COALESCE_MAXERROR is not a
 * real result; it is the number of codes and is used to size per-code
 * counters.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code, indexed by the code.
 * The entries are keyed explicitly by enum value so the table cannot
 * drift out of step with the enum when a code is added or reordered.
 */
const char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
105
106 static struct ulfs1_dinode *
107 get_dinode(struct clfs *fs, ino_t ino)
108 {
109 IFILE *ifp;
110 daddr_t daddr;
111 struct ubuf *bp;
112 struct ulfs1_dinode *dip, *r;
113
114 lfs_ientry(&ifp, fs, ino, &bp);
115 daddr = ifp->if_daddr;
116 brelse(bp, 0);
117
118 if (daddr == 0x0)
119 return NULL;
120
121 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
122 for (dip = (struct ulfs1_dinode *)bp->b_data;
123 dip < (struct ulfs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
124 if (dip->di_inumber == ino) {
125 r = (struct ulfs1_dinode *)malloc(sizeof(*r));
126 if (r == NULL)
127 break;
128 memcpy(r, dip, sizeof(*r));
129 brelse(bp, 0);
130 return r;
131 }
132 brelse(bp, 0);
133 return NULL;
134 }
135
136 /*
137 * Find out if this inode's data blocks are discontinuous; if they are,
138 * rewrite them using markv. Return the number of inodes rewritten.
139 */
140 static int
141 clean_inode(struct clfs *fs, ino_t ino)
142 {
143 BLOCK_INFO *bip = NULL, *tbip;
144 CLEANERINFO cip;
145 struct ubuf *bp;
146 struct ulfs1_dinode *dip;
147 struct clfs_seguse *sup;
148 struct lfs_fcntl_markv /* {
149 BLOCK_INFO *blkiov;
150 int blkcnt;
151 } */ lim;
152 daddr_t toff;
153 int i;
154 int nb, onb, noff;
155 int retval;
156 int bps;
157
158 dip = get_dinode(fs, ino);
159 if (dip == NULL)
160 return COALESCE_NOINODE;
161
162 /* Compute file block size, set up for bmapv */
163 onb = nb = lblkno(fs, dip->di_size);
164
165 /* XXX for now, don't do any file small enough to have fragments */
166 if (nb < ULFS_NDADDR) {
167 free(dip);
168 return COALESCE_TOOSMALL;
169 }
170
171 /* Sanity checks */
172 #if 0 /* di_size is uint64_t -- this is a noop */
173 if (dip->di_size < 0) {
174 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
175 free(dip);
176 return COALESCE_BADSIZE;
177 }
178 #endif
179 if (nb > dip->di_blocks) {
180 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
181 dip->di_blocks);
182 free(dip);
183 return COALESCE_BADBLOCKSIZE;
184 }
185
186 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
187 if (bip == NULL) {
188 syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
189 (unsigned long long)ino, nb);
190 free(dip);
191 return COALESCE_NOMEM;
192 }
193 for (i = 0; i < nb; i++) {
194 memset(bip + i, 0, sizeof(BLOCK_INFO));
195 bip[i].bi_inode = ino;
196 bip[i].bi_lbn = i;
197 bip[i].bi_version = dip->di_gen;
198 /* Don't set the size, but let lfs_bmap fill it in */
199 }
200 lim.blkiov = bip;
201 lim.blkcnt = nb;
202 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
203 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
204 fs->lfs_fsmnt);
205 retval = COALESCE_BADBMAPV;
206 goto out;
207 }
208 #if 0
209 for (i = 0; i < nb; i++) {
210 printf("bi_size = %d, bi_ino = %d, "
211 "bi_lbn = %d, bi_daddr = %d\n",
212 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
213 bip[i].bi_daddr);
214 }
215 #endif
216 noff = toff = 0;
217 for (i = 1; i < nb; i++) {
218 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
219 ++noff;
220 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
221 - fs->lfs_frag) >> fs->lfs_fbshift;
222 }
223
224 /*
225 * If this file is not discontinuous, there's no point in rewriting it.
226 *
227 * Explicitly allow a certain amount of discontinuity, since large
228 * files will be broken among segments and medium-sized files
229 * can have a break or two and it's okay.
230 */
231 if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
232 segtod(fs, noff) * 2 < nb) {
233 retval = COALESCE_NOTWORTHIT;
234 goto out;
235 } else if (debug)
236 syslog(LOG_DEBUG, "ino %llu total discontinuity "
237 "%d (%lld) for %d blocks", (unsigned long long)ino,
238 noff, (long long)toff, nb);
239
240 /* Search for blocks in active segments; don't move them. */
241 for (i = 0; i < nb; i++) {
242 if (bip[i].bi_daddr <= 0)
243 continue;
244 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
245 if (sup->flags & SEGUSE_ACTIVE)
246 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
247 }
248
249 /*
250 * Get rid of any blocks we've marked dead. If this is an older
251 * kernel that doesn't have bmapv fill in the block sizes, we'll
252 * toss everything here.
253 */
254 onb = nb;
255 toss_old_blocks(fs, &bip, &nb, NULL);
256 nb = i;
257
258 /*
259 * We may have tossed enough blocks that it is no longer worthwhile
260 * to rewrite this inode.
261 */
262 if (nb == 0 || onb - nb > log2int(onb)) {
263 if (debug)
264 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
265 retval = COALESCE_NOTHINGLEFT;
266 goto out;
267 }
268
269 /*
270 * We are going to rewrite this inode.
271 * For any remaining blocks, read in their contents.
272 */
273 for (i = 0; i < nb; i++) {
274 bip[i].bi_bp = malloc(bip[i].bi_size);
275 if (bip[i].bi_bp == NULL) {
276 syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
277 bip[i].bi_size);
278 retval = COALESCE_NOMEM;
279 goto out;
280 }
281
282 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
283 fsbtob(fs, bip[i].bi_daddr)) < 0) {
284 retval = COALESCE_EIO;
285 goto out;
286 }
287 }
288 if (debug)
289 syslog(LOG_DEBUG, "ino %llu markv %d blocks",
290 (unsigned long long)ino, nb);
291
292 /*
293 * Write in segment-sized chunks. If at any point we'd write more
294 * than half of the available segments, sleep until that's not
295 * true any more.
296 */
297 bps = segtod(fs, 1);
298 for (tbip = bip; tbip < bip + nb; tbip += bps) {
299 do {
300 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
301 cip = *(CLEANERINFO *)bp->b_data;
302 brelse(bp, B_INVAL);
303
304 if (cip.clean < 4) /* XXX magic number 4 */
305 kops.ko_fcntl(fs->clfs_ifilefd,
306 LFCNSEGWAIT, NULL);
307 } while(cip.clean < 4);
308
309 lim.blkiov = tbip;
310 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
311 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
312 retval = COALESCE_BADMARKV;
313 goto out;
314 }
315 }
316
317 retval = COALESCE_OK;
318 out:
319 free(dip);
320 if (bip) {
321 for (i = 0; i < onb; i++)
322 if (bip[i].bi_bp)
323 free(bip[i].bi_bp);
324 free(bip);
325 }
326 return retval;
327 }
328
329 /*
330 * Try coalescing every inode in the filesystem.
331 * Return the number of inodes actually altered.
332 */
333 int clean_all_inodes(struct clfs *fs)
334 {
335 int i, r, maxino;
336 int totals[COALESCE_MAXERROR];
337 struct stat st;
338
339 memset(totals, 0, sizeof(totals));
340
341 fstat(fs->clfs_ifilefd, &st);
342 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
343 fs->lfs_segtabsz - fs->lfs_cleansz;
344
345 for (i = 0; i < maxino; i++) {
346 r = clean_inode(fs, i);
347 ++totals[r];
348 }
349
350 for (i = 0; i < COALESCE_MAXERROR; i++)
351 if (totals[i])
352 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
353 totals[i]);
354
355 return totals[COALESCE_OK];
356 }
357
358 /*
359 * Fork a child process to coalesce this fs.
360 */
361 int
362 fork_coalesce(struct clfs *fs)
363 {
364 static pid_t childpid;
365 int num;
366
367 /*
368 * If already running a coalescing child, don't start a new one.
369 */
370 if (childpid) {
371 if (waitpid(childpid, NULL, WNOHANG) == childpid)
372 childpid = 0;
373 }
374 if (childpid && kill(childpid, 0) >= 0) {
375 /* already running a coalesce process */
376 if (debug)
377 syslog(LOG_DEBUG, "coalescing already in progress");
378 return 0;
379 }
380
381 /*
382 * Fork a child and let the child coalease
383 */
384 childpid = fork();
385 if (childpid < 0) {
386 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
387 return 0;
388 } else if (childpid == 0) {
389 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
390 fs->lfs_fsmnt, getpid());
391 num = clean_all_inodes(fs);
392 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
393 fs->lfs_fsmnt, num);
394 exit(0);
395 }
396
397 return 0;
398 }
399