coalesce.c revision 1.11 1 1.11 perseant /* $NetBSD: coalesce.c,v 1.11 2006/03/30 19:10:13 perseant Exp $ */
2 1.1 perseant
3 1.1 perseant /*-
4 1.11 perseant * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 1.1 perseant * All rights reserved.
6 1.1 perseant *
7 1.1 perseant * This code is derived from software contributed to The NetBSD Foundation
8 1.1 perseant * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 1.1 perseant *
10 1.1 perseant * Redistribution and use in source and binary forms, with or without
11 1.1 perseant * modification, are permitted provided that the following conditions
12 1.1 perseant * are met:
13 1.1 perseant * 1. Redistributions of source code must retain the above copyright
14 1.1 perseant * notice, this list of conditions and the following disclaimer.
15 1.1 perseant * 2. Redistributions in binary form must reproduce the above copyright
16 1.1 perseant * notice, this list of conditions and the following disclaimer in the
17 1.1 perseant * documentation and/or other materials provided with the distribution.
18 1.1 perseant * 3. All advertising materials mentioning features or use of this software
19 1.1 perseant * must display the following acknowledgement:
20 1.1 perseant * This product includes software developed by the NetBSD
21 1.1 perseant * Foundation, Inc. and its contributors.
22 1.1 perseant * 4. Neither the name of The NetBSD Foundation nor the names of its
23 1.1 perseant * contributors may be used to endorse or promote products derived
24 1.1 perseant * from this software without specific prior written permission.
25 1.1 perseant *
26 1.1 perseant * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 1.1 perseant * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 1.1 perseant * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 1.1 perseant * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 1.1 perseant * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 1.1 perseant * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 1.1 perseant * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 1.1 perseant * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 1.1 perseant * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 1.1 perseant * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 1.1 perseant * POSSIBILITY OF SUCH DAMAGE.
37 1.1 perseant */
38 1.1 perseant
39 1.1 perseant #include <sys/param.h>
40 1.1 perseant #include <sys/mount.h>
41 1.1 perseant #include <sys/time.h>
42 1.1 perseant #include <sys/resource.h>
43 1.1 perseant #include <sys/types.h>
44 1.1 perseant #include <sys/wait.h>
45 1.1 perseant #include <sys/mman.h>
46 1.1 perseant
47 1.1 perseant #include <ufs/ufs/dinode.h>
48 1.1 perseant #include <ufs/lfs/lfs.h>
49 1.1 perseant
50 1.1 perseant #include <fcntl.h>
51 1.1 perseant #include <signal.h>
52 1.1 perseant #include <stdio.h>
53 1.1 perseant #include <stdlib.h>
54 1.1 perseant #include <string.h>
55 1.1 perseant #include <time.h>
56 1.1 perseant #include <unistd.h>
57 1.1 perseant #include <util.h>
58 1.1 perseant #include <errno.h>
59 1.1 perseant #include <err.h>
60 1.1 perseant
61 1.1 perseant #include <syslog.h>
62 1.1 perseant
63 1.11 perseant #include "bufcache.h"
64 1.11 perseant #include "vnode.h"
65 1.11 perseant #include "cleaner.h"
66 1.1 perseant
67 1.2 perseant extern int debug, do_mmap;
68 1.1 perseant
/*
 * Integer base-2 logarithm, rounded down: the position of the highest
 * set bit of n.  Yields -1 for any n <= 0.
 */
int
log2int(int n)
{
	int exponent = -1;

	/* Each shift that still leaves a positive value adds one bit. */
	for (; n > 0; n >>= 1)
		exponent++;

	return exponent;
}
80 1.2 perseant
/*
 * Result codes returned by clean_inode().  The values double as indices
 * into coalesce_return[] and into the per-code tally kept by
 * clean_all_inodes(), so they must stay dense and start at zero.
 */
enum coalesce_returncodes {
	COALESCE_OK           = 0,	/* file was rewritten */
	COALESCE_NOINODE      = 1,	/* get_dinode() found no inode */
	COALESCE_TOOSMALL     = 2,	/* fewer than NDADDR blocks */
	COALESCE_BADSIZE      = 3,	/* negative di_size */
	COALESCE_BADBLOCKSIZE = 4,	/* size implies more blocks than held */
	COALESCE_NOMEM        = 5,	/* malloc failed */
	COALESCE_BADBMAPV     = 6,	/* LFCNBMAPV fcntl failed */
	COALESCE_BADMARKV     = 7,	/* LFCNMARKV fcntl failed */
	COALESCE_NOTWORTHIT   = 8,	/* not discontinuous enough */
	COALESCE_NOTHINGLEFT  = 9,	/* too many blocks were tossed */
	COALESCE_EIO          = 10,	/* reading a block failed */

	COALESCE_MAXERROR     = 11	/* number of codes; not a real result */
};
96 1.3 perseant
/*
 * Human-readable descriptions of the coalesce result codes, indexed by
 * enum coalesce_returncodes.  There must be exactly one entry per
 * enumerator, in the same order, including the trailing entry for
 * COALESCE_MAXERROR; when a code is added to the enum, add its string
 * at the same position here.
 */
char *coalesce_return[] = {
	"Successfully coalesced",			/* COALESCE_OK */
	"File not in use or inode not found",		/* COALESCE_NOINODE */
	"Not large enough to coalesce",			/* COALESCE_TOOSMALL */
	"Negative size",				/* COALESCE_BADSIZE */
	"Not enough blocks to account for size",	/* COALESCE_BADBLOCKSIZE */
	"Malloc failed",				/* COALESCE_NOMEM */
	"LFCNBMAPV failed",				/* COALESCE_BADBMAPV */
	"LFCNMARKV failed",				/* COALESCE_BADMARKV */
	"Not broken enough to fix",			/* COALESCE_NOTWORTHIT */
	"Too many blocks found in active segments",	/* COALESCE_NOTHINGLEFT */
	"I/O error",					/* COALESCE_EIO */

	"No such error"					/* COALESCE_MAXERROR */
};
112 1.3 perseant
113 1.11 perseant static struct ufs1_dinode *
114 1.11 perseant get_dinode(struct clfs *fs, ino_t ino)
115 1.11 perseant {
116 1.11 perseant IFILE *ifp;
117 1.11 perseant daddr_t daddr;
118 1.11 perseant struct ubuf *bp;
119 1.11 perseant struct ufs1_dinode *dip, *r;
120 1.11 perseant
121 1.11 perseant lfs_ientry(&ifp, fs, ino, &bp);
122 1.11 perseant daddr = ifp->if_daddr;
123 1.11 perseant brelse(bp);
124 1.11 perseant
125 1.11 perseant if (daddr == 0x0)
126 1.11 perseant return NULL;
127 1.11 perseant
128 1.11 perseant bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp);
129 1.11 perseant for (dip = (struct ufs1_dinode *)bp->b_data;
130 1.11 perseant dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
131 1.11 perseant if (dip->di_inumber == ino) {
132 1.11 perseant r = (struct ufs1_dinode *)malloc(sizeof(*r));
133 1.11 perseant memcpy(r, dip, sizeof(*r));
134 1.11 perseant brelse(bp);
135 1.11 perseant return r;
136 1.11 perseant }
137 1.11 perseant brelse(bp);
138 1.11 perseant return NULL;
139 1.11 perseant }
140 1.11 perseant
141 1.1 perseant /*
142 1.1 perseant * Find out if this inode's data blocks are discontinuous; if they are,
143 1.7 perseant * rewrite them using markv. Return the number of inodes rewritten.
144 1.1 perseant */
145 1.11 perseant static int
146 1.11 perseant clean_inode(struct clfs *fs, ino_t ino)
147 1.1 perseant {
148 1.7 perseant BLOCK_INFO *bip = NULL, *tbip;
149 1.11 perseant CLEANERINFO cip;
150 1.11 perseant struct ubuf *bp;
151 1.9 fvdl struct ufs1_dinode *dip;
152 1.11 perseant struct clfs_seguse *sup;
153 1.11 perseant struct lfs_fcntl_markv /* {
154 1.11 perseant BLOCK_INFO *blkiov;
155 1.11 perseant int blkcnt;
156 1.11 perseant } */ lim;
157 1.11 perseant daddr_t toff;
158 1.11 perseant int i;
159 1.2 perseant int nb, onb, noff;
160 1.11 perseant int retval;
161 1.1 perseant int bps;
162 1.1 perseant
163 1.11 perseant dip = get_dinode(fs, ino);
164 1.1 perseant if (dip == NULL)
165 1.3 perseant return COALESCE_NOINODE;
166 1.1 perseant
167 1.7 perseant /* Compute file block size, set up for bmapv */
168 1.11 perseant onb = nb = lblkno(fs, dip->di_size);
169 1.2 perseant
170 1.2 perseant /* XXX for now, don't do any file small enough to have fragments */
171 1.2 perseant if (nb < NDADDR)
172 1.3 perseant return COALESCE_TOOSMALL;
173 1.2 perseant
174 1.2 perseant /* Sanity checks */
175 1.2 perseant if (dip->di_size < 0) {
176 1.11 perseant dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
177 1.3 perseant return COALESCE_BADSIZE;
178 1.2 perseant }
179 1.1 perseant if (nb > dip->di_blocks) {
180 1.11 perseant dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
181 1.11 perseant dip->di_blocks);
182 1.3 perseant return COALESCE_BADBLOCKSIZE;
183 1.1 perseant }
184 1.2 perseant
185 1.7 perseant bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
186 1.1 perseant if (bip == NULL) {
187 1.10 christos syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
188 1.10 christos (unsigned long long)ino, nb);
189 1.3 perseant return COALESCE_NOMEM;
190 1.1 perseant }
191 1.1 perseant for (i = 0; i < nb; i++) {
192 1.7 perseant memset(bip + i, 0, sizeof(BLOCK_INFO));
193 1.1 perseant bip[i].bi_inode = ino;
194 1.1 perseant bip[i].bi_lbn = i;
195 1.2 perseant bip[i].bi_version = dip->di_gen;
196 1.1 perseant /* Don't set the size, but let lfs_bmap fill it in */
197 1.1 perseant }
198 1.11 perseant lim.blkiov = bip;
199 1.11 perseant lim.blkcnt = nb;
200 1.11 perseant if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
201 1.11 perseant syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
202 1.11 perseant fs->lfs_fsmnt);
203 1.5 yamt retval = COALESCE_BADBMAPV;
204 1.5 yamt goto out;
205 1.5 yamt }
206 1.5 yamt #if 0
207 1.5 yamt for (i = 0; i < nb; i++) {
208 1.5 yamt printf("bi_size = %d, bi_ino = %d, "
209 1.5 yamt "bi_lbn = %d, bi_daddr = %d\n",
210 1.5 yamt bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
211 1.5 yamt bip[i].bi_daddr);
212 1.1 perseant }
213 1.5 yamt #endif
214 1.1 perseant noff = toff = 0;
215 1.1 perseant for (i = 1; i < nb; i++) {
216 1.11 perseant if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
217 1.1 perseant ++noff;
218 1.4 yamt toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
219 1.11 perseant - fs->lfs_frag) >> fs->lfs_fbshift;
220 1.1 perseant }
221 1.1 perseant
222 1.1 perseant /*
223 1.1 perseant * If this file is not discontinuous, there's no point in rewriting it.
224 1.11 perseant *
225 1.11 perseant * Explicitly allow a certain amount of discontinuity, since large
226 1.11 perseant * files will be broken among segments and medium-sized files
227 1.11 perseant * can have a break or two and it's okay.
228 1.1 perseant */
229 1.2 perseant if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
230 1.11 perseant segtod(fs, noff) * 2 < nb) {
231 1.5 yamt retval = COALESCE_NOTWORTHIT;
232 1.5 yamt goto out;
233 1.1 perseant } else if (debug)
234 1.10 christos syslog(LOG_DEBUG, "ino %llu total discontinuity "
235 1.10 christos "%d (%lld) for %d blocks", (unsigned long long)ino,
236 1.10 christos noff, (long long)toff, nb);
237 1.1 perseant
238 1.1 perseant /* Search for blocks in active segments; don't move them. */
239 1.1 perseant for (i = 0; i < nb; i++) {
240 1.1 perseant if (bip[i].bi_daddr <= 0)
241 1.1 perseant continue;
242 1.11 perseant sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
243 1.11 perseant if (sup->flags & SEGUSE_ACTIVE)
244 1.1 perseant bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
245 1.1 perseant }
246 1.11 perseant
247 1.11 perseant /*
248 1.11 perseant * Get rid of any blocks we've marked dead. If this is an older
249 1.11 perseant * kernel that doesn't have bmapv fill in the block sizes, we'll
250 1.11 perseant * toss everything here.
251 1.1 perseant */
252 1.11 perseant onb = nb;
253 1.11 perseant toss_old_blocks(fs, &bip, &nb);
254 1.11 perseant nb = i;
255 1.2 perseant
256 1.1 perseant /*
257 1.2 perseant * We may have tossed enough blocks that it is no longer worthwhile
258 1.2 perseant * to rewrite this inode.
259 1.1 perseant */
260 1.11 perseant if (nb == 0 || onb - nb > log2int(onb)) {
261 1.3 perseant if (debug)
262 1.3 perseant syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
263 1.11 perseant retval = COALESCE_NOTHINGLEFT;
264 1.11 perseant goto out;
265 1.1 perseant }
266 1.1 perseant
267 1.11 perseant /*
268 1.1 perseant * We are going to rewrite this inode.
269 1.1 perseant * For any remaining blocks, read in their contents.
270 1.1 perseant */
271 1.1 perseant for (i = 0; i < nb; i++) {
272 1.1 perseant bip[i].bi_bp = malloc(bip[i].bi_size);
273 1.5 yamt if (bip[i].bi_bp == NULL) {
274 1.5 yamt syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
275 1.5 yamt bip[i].bi_size);
276 1.5 yamt retval = COALESCE_NOMEM;
277 1.5 yamt goto out;
278 1.5 yamt }
279 1.11 perseant
280 1.11 perseant if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
281 1.11 perseant fsbtob(fs, bip[i].bi_daddr)) < 0) {
282 1.5 yamt retval = COALESCE_EIO;
283 1.5 yamt goto out;
284 1.5 yamt }
285 1.1 perseant }
286 1.1 perseant if (debug)
287 1.10 christos syslog(LOG_DEBUG, "ino %llu markv %d blocks",
288 1.10 christos (unsigned long long)ino, nb);
289 1.1 perseant
290 1.2 perseant /*
291 1.2 perseant * Write in segment-sized chunks. If at any point we'd write more
292 1.2 perseant * than half of the available segments, sleep until that's not
293 1.2 perseant * true any more.
294 1.2 perseant */
295 1.11 perseant bps = segtod(fs, 1);
296 1.1 perseant for (tbip = bip; tbip < bip + nb; tbip += bps) {
297 1.11 perseant do {
298 1.11 perseant bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp);
299 1.11 perseant cip = *(CLEANERINFO *)bp->b_data;
300 1.11 perseant bp->b_flags |= B_INVAL;
301 1.11 perseant brelse(bp);
302 1.11 perseant
303 1.11 perseant if (cip.clean < 4) /* XXX magic number 4 */
304 1.11 perseant fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
305 1.11 perseant } while(cip.clean < 4);
306 1.11 perseant
307 1.11 perseant lim.blkiov = tbip;
308 1.11 perseant lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
309 1.11 perseant if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
310 1.11 perseant retval = COALESCE_BADMARKV;
311 1.11 perseant goto out;
312 1.2 perseant }
313 1.1 perseant }
314 1.1 perseant
315 1.5 yamt retval = COALESCE_OK;
316 1.5 yamt out:
317 1.11 perseant free(dip);
318 1.5 yamt if (bip) {
319 1.5 yamt for (i = 0; i < onb; i++)
320 1.5 yamt if (bip[i].bi_bp)
321 1.5 yamt free(bip[i].bi_bp);
322 1.5 yamt free(bip);
323 1.5 yamt }
324 1.5 yamt return retval;
325 1.1 perseant }
326 1.1 perseant
327 1.1 perseant /*
328 1.1 perseant * Try coalescing every inode in the filesystem.
329 1.1 perseant * Return the number of inodes actually altered.
330 1.1 perseant */
331 1.11 perseant int clean_all_inodes(struct clfs *fs)
332 1.1 perseant {
333 1.11 perseant int i, r, maxino;
334 1.3 perseant int totals[COALESCE_MAXERROR];
335 1.11 perseant struct stat st;
336 1.1 perseant
337 1.3 perseant memset(totals, 0, sizeof(totals));
338 1.11 perseant
339 1.11 perseant fstat(fs->clfs_ifilefd, &st);
340 1.11 perseant maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
341 1.11 perseant fs->lfs_segtabsz - fs->lfs_cleansz;
342 1.11 perseant
343 1.11 perseant for (i = 0; i < maxino; i++) {
344 1.11 perseant r = clean_inode(fs, i);
345 1.3 perseant ++totals[r];
346 1.1 perseant }
347 1.3 perseant
348 1.3 perseant for (i = 0; i < COALESCE_MAXERROR; i++)
349 1.3 perseant if (totals[i])
350 1.3 perseant syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
351 1.11 perseant totals[i]);
352 1.11 perseant
353 1.3 perseant return totals[COALESCE_OK];
354 1.1 perseant }
355 1.1 perseant
356 1.11 perseant /*
357 1.11 perseant * Fork a child process to coalesce this fs.
358 1.11 perseant */
359 1.11 perseant int
360 1.11 perseant fork_coalesce(struct clfs *fs)
361 1.1 perseant {
362 1.1 perseant static pid_t childpid;
363 1.2 perseant int num;
364 1.2 perseant
365 1.11 perseant /*
366 1.11 perseant * If already running a coalescing child, don't start a new one.
367 1.11 perseant */
368 1.1 perseant if (childpid) {
369 1.11 perseant if (waitpid(childpid, NULL, WNOHANG) == childpid)
370 1.1 perseant childpid = 0;
371 1.1 perseant }
372 1.1 perseant if (childpid && kill(childpid, 0) >= 0) {
373 1.1 perseant /* already running a coalesce process */
374 1.2 perseant if (debug)
375 1.2 perseant syslog(LOG_DEBUG, "coalescing already in progress");
376 1.1 perseant return 0;
377 1.1 perseant }
378 1.11 perseant
379 1.11 perseant /*
380 1.11 perseant * Fork a child and let the child coalease
381 1.11 perseant */
382 1.1 perseant childpid = fork();
383 1.1 perseant if (childpid < 0) {
384 1.11 perseant syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
385 1.1 perseant return 0;
386 1.1 perseant } else if (childpid == 0) {
387 1.11 perseant syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
388 1.11 perseant fs->lfs_fsmnt, getpid());
389 1.11 perseant num = clean_all_inodes(fs);
390 1.11 perseant syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
391 1.11 perseant fs->lfs_fsmnt, num);
392 1.1 perseant exit(0);
393 1.1 perseant }
394 1.11 perseant
395 1.1 perseant return 0;
396 1.1 perseant }
397