coalesce.c revision 1.29 1 1.29 dholland /* $NetBSD: coalesce.c,v 1.29 2015/08/12 18:23:16 dholland Exp $ */
2 1.1 perseant
3 1.1 perseant /*-
4 1.11 perseant * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
5 1.1 perseant * All rights reserved.
6 1.1 perseant *
7 1.1 perseant * This code is derived from software contributed to The NetBSD Foundation
8 1.1 perseant * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 1.1 perseant *
10 1.1 perseant * Redistribution and use in source and binary forms, with or without
11 1.1 perseant * modification, are permitted provided that the following conditions
12 1.1 perseant * are met:
13 1.1 perseant * 1. Redistributions of source code must retain the above copyright
14 1.1 perseant * notice, this list of conditions and the following disclaimer.
15 1.1 perseant * 2. Redistributions in binary form must reproduce the above copyright
16 1.1 perseant * notice, this list of conditions and the following disclaimer in the
17 1.1 perseant * documentation and/or other materials provided with the distribution.
18 1.1 perseant *
19 1.1 perseant * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 1.1 perseant * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 1.1 perseant * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 1.1 perseant * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 1.1 perseant * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 1.1 perseant * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 1.1 perseant * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 1.1 perseant * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 1.1 perseant * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 1.1 perseant * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 1.1 perseant * POSSIBILITY OF SUCH DAMAGE.
30 1.1 perseant */
31 1.1 perseant
32 1.1 perseant #include <sys/param.h>
33 1.1 perseant #include <sys/mount.h>
34 1.1 perseant #include <sys/time.h>
35 1.1 perseant #include <sys/resource.h>
36 1.1 perseant #include <sys/types.h>
37 1.1 perseant #include <sys/wait.h>
38 1.1 perseant #include <sys/mman.h>
39 1.1 perseant
40 1.1 perseant #include <ufs/lfs/lfs.h>
41 1.1 perseant
42 1.1 perseant #include <fcntl.h>
43 1.1 perseant #include <signal.h>
44 1.1 perseant #include <stdio.h>
45 1.1 perseant #include <stdlib.h>
46 1.1 perseant #include <string.h>
47 1.1 perseant #include <time.h>
48 1.1 perseant #include <unistd.h>
49 1.1 perseant #include <util.h>
50 1.1 perseant #include <errno.h>
51 1.1 perseant #include <err.h>
52 1.26 dholland #include <assert.h>
53 1.1 perseant
54 1.1 perseant #include <syslog.h>
55 1.1 perseant
56 1.11 perseant #include "bufcache.h"
57 1.11 perseant #include "vnode.h"
58 1.11 perseant #include "cleaner.h"
59 1.18 pooka #include "kernelops.h"
60 1.1 perseant
61 1.2 perseant extern int debug, do_mmap;
62 1.1 perseant
/*
 * Integer base-2 logarithm, rounded down: the zero-based position of
 * the highest set bit of n.  Any n <= 0 yields -1.
 *
 * XXX the argument can go back to plain int when/if it no longer has
 * to carry potentially huge block counts.
 */
static int
log2int(intmax_t n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
79 1.2 perseant
/*
 * Result codes returned by clean_inode().  COALESCE_MAXERROR is not a
 * real result; it is the number of codes and sizes the per-code
 * counters in clean_all_inodes().
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code, indexed by
 * enum coalesce_returncodes.  Designated initializers tie every string
 * to its code so the table cannot drift out of step with the enum
 * (it previously lacked an entry for COALESCE_BADMARKV, which shifted
 * the messages reported for the codes that follow it).
 */
const char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
111 1.3 perseant
112 1.21 dholland static struct ulfs1_dinode *
113 1.11 perseant get_dinode(struct clfs *fs, ino_t ino)
114 1.11 perseant {
115 1.11 perseant IFILE *ifp;
116 1.11 perseant daddr_t daddr;
117 1.11 perseant struct ubuf *bp;
118 1.21 dholland struct ulfs1_dinode *dip, *r;
119 1.11 perseant
120 1.11 perseant lfs_ientry(&ifp, fs, ino, &bp);
121 1.11 perseant daddr = ifp->if_daddr;
122 1.14 ad brelse(bp, 0);
123 1.11 perseant
124 1.11 perseant if (daddr == 0x0)
125 1.11 perseant return NULL;
126 1.11 perseant
127 1.26 dholland bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
128 1.21 dholland for (dip = (struct ulfs1_dinode *)bp->b_data;
129 1.26 dholland dip < (struct ulfs1_dinode *)(bp->b_data + lfs_sb_getibsize(fs)); dip++)
130 1.11 perseant if (dip->di_inumber == ino) {
131 1.21 dholland r = (struct ulfs1_dinode *)malloc(sizeof(*r));
132 1.19 perseant if (r == NULL)
133 1.19 perseant break;
134 1.11 perseant memcpy(r, dip, sizeof(*r));
135 1.14 ad brelse(bp, 0);
136 1.11 perseant return r;
137 1.11 perseant }
138 1.14 ad brelse(bp, 0);
139 1.11 perseant return NULL;
140 1.11 perseant }
141 1.11 perseant
142 1.1 perseant /*
143 1.1 perseant * Find out if this inode's data blocks are discontinuous; if they are,
144 1.7 perseant * rewrite them using markv. Return the number of inodes rewritten.
145 1.1 perseant */
146 1.11 perseant static int
147 1.11 perseant clean_inode(struct clfs *fs, ino_t ino)
148 1.1 perseant {
149 1.7 perseant BLOCK_INFO *bip = NULL, *tbip;
150 1.11 perseant CLEANERINFO cip;
151 1.11 perseant struct ubuf *bp;
152 1.21 dholland struct ulfs1_dinode *dip;
153 1.11 perseant struct clfs_seguse *sup;
154 1.11 perseant struct lfs_fcntl_markv /* {
155 1.11 perseant BLOCK_INFO *blkiov;
156 1.11 perseant int blkcnt;
157 1.11 perseant } */ lim;
158 1.11 perseant daddr_t toff;
159 1.29 dholland int noff;
160 1.29 dholland blkcnt_t i, nb, onb;
161 1.11 perseant int retval;
162 1.1 perseant int bps;
163 1.1 perseant
164 1.11 perseant dip = get_dinode(fs, ino);
165 1.1 perseant if (dip == NULL)
166 1.3 perseant return COALESCE_NOINODE;
167 1.1 perseant
168 1.7 perseant /* Compute file block size, set up for bmapv */
169 1.23 christos onb = nb = lfs_lblkno(fs, dip->di_size);
170 1.2 perseant
171 1.2 perseant /* XXX for now, don't do any file small enough to have fragments */
172 1.21 dholland if (nb < ULFS_NDADDR) {
173 1.12 christos free(dip);
174 1.3 perseant return COALESCE_TOOSMALL;
175 1.12 christos }
176 1.2 perseant
177 1.2 perseant /* Sanity checks */
178 1.17 lukem #if 0 /* di_size is uint64_t -- this is a noop */
179 1.2 perseant if (dip->di_size < 0) {
180 1.11 perseant dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
181 1.12 christos free(dip);
182 1.3 perseant return COALESCE_BADSIZE;
183 1.2 perseant }
184 1.17 lukem #endif
185 1.1 perseant if (nb > dip->di_blocks) {
186 1.29 dholland dlog("ino %ju, computed blocks %jd > held blocks %ju",
187 1.29 dholland (uintmax_t)ino, (intmax_t)nb,
188 1.29 dholland (uintmax_t)dip->di_blocks);
189 1.12 christos free(dip);
190 1.3 perseant return COALESCE_BADBLOCKSIZE;
191 1.1 perseant }
192 1.2 perseant
193 1.29 dholland /*
194 1.29 dholland * XXX: We should really coalesce really large files in
195 1.29 dholland * chunks, as there's substantial diminishing returns and
196 1.29 dholland * mallocing huge amounts of memory just to get those returns
197 1.29 dholland * is pretty silly. But that requires a big rework of this
198 1.29 dholland * code. (On the plus side though then we can stop worrying
199 1.29 dholland * about block counts > 2^31.)
200 1.29 dholland */
201 1.29 dholland
202 1.29 dholland /* ugh, no DADDR_T_MAX */
203 1.29 dholland __CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
204 1.29 dholland if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
205 1.29 dholland syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
206 1.29 dholland (uintmax_t)ino, (uintmax_t)nb);
207 1.29 dholland free(dip);
208 1.29 dholland return COALESCE_NOMEM;
209 1.29 dholland }
210 1.29 dholland
211 1.7 perseant bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
212 1.1 perseant if (bip == NULL) {
213 1.29 dholland syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
214 1.29 dholland (unsigned long long)ino, (intmax_t)nb,
215 1.29 dholland strerror(errno));
216 1.12 christos free(dip);
217 1.3 perseant return COALESCE_NOMEM;
218 1.1 perseant }
219 1.1 perseant for (i = 0; i < nb; i++) {
220 1.7 perseant memset(bip + i, 0, sizeof(BLOCK_INFO));
221 1.1 perseant bip[i].bi_inode = ino;
222 1.1 perseant bip[i].bi_lbn = i;
223 1.2 perseant bip[i].bi_version = dip->di_gen;
224 1.1 perseant /* Don't set the size, but let lfs_bmap fill it in */
225 1.1 perseant }
226 1.29 dholland /*
227 1.29 dholland * The kernel also contains this check; but as lim.blkcnt is
228 1.29 dholland * only 32 bits wide, we need to check ourselves too in case
229 1.29 dholland * we'd otherwise truncate a value > 2^31, as that might
230 1.29 dholland * succeed and create bizarre results.
231 1.29 dholland */
232 1.29 dholland if (nb > LFS_MARKV_MAXBLKCNT) {
233 1.29 dholland syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
234 1.29 dholland lfs_sb_getfsmnt(fs));
235 1.29 dholland retval = COALESCE_BADBMAPV;
236 1.29 dholland goto out;
237 1.29 dholland }
238 1.11 perseant lim.blkiov = bip;
239 1.11 perseant lim.blkcnt = nb;
240 1.18 pooka if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
241 1.11 perseant syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
242 1.26 dholland lfs_sb_getfsmnt(fs));
243 1.5 yamt retval = COALESCE_BADBMAPV;
244 1.5 yamt goto out;
245 1.5 yamt }
246 1.5 yamt #if 0
247 1.5 yamt for (i = 0; i < nb; i++) {
248 1.29 dholland printf("bi_size = %d, bi_ino = %ju, "
249 1.29 dholland "bi_lbn = %jd, bi_daddr = %jd\n",
250 1.29 dholland bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
251 1.29 dholland (intmax_t)bip[i].bi_lbn,
252 1.29 dholland (intmax_t)bip[i].bi_daddr);
253 1.1 perseant }
254 1.5 yamt #endif
255 1.29 dholland noff = 0;
256 1.29 dholland toff = 0;
257 1.1 perseant for (i = 1; i < nb; i++) {
258 1.28 dholland if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
259 1.1 perseant ++noff;
260 1.4 yamt toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
261 1.28 dholland - lfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
262 1.1 perseant }
263 1.1 perseant
264 1.1 perseant /*
265 1.1 perseant * If this file is not discontinuous, there's no point in rewriting it.
266 1.11 perseant *
267 1.11 perseant * Explicitly allow a certain amount of discontinuity, since large
268 1.11 perseant * files will be broken among segments and medium-sized files
269 1.11 perseant * can have a break or two and it's okay.
270 1.1 perseant */
271 1.2 perseant if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
272 1.23 christos lfs_segtod(fs, noff) * 2 < nb) {
273 1.5 yamt retval = COALESCE_NOTWORTHIT;
274 1.5 yamt goto out;
275 1.1 perseant } else if (debug)
276 1.10 christos syslog(LOG_DEBUG, "ino %llu total discontinuity "
277 1.29 dholland "%d (%jd) for %jd blocks", (unsigned long long)ino,
278 1.29 dholland noff, (intmax_t)toff, (intmax_t)nb);
279 1.1 perseant
280 1.1 perseant /* Search for blocks in active segments; don't move them. */
281 1.1 perseant for (i = 0; i < nb; i++) {
282 1.1 perseant if (bip[i].bi_daddr <= 0)
283 1.1 perseant continue;
284 1.23 christos sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
285 1.11 perseant if (sup->flags & SEGUSE_ACTIVE)
286 1.1 perseant bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
287 1.1 perseant }
288 1.11 perseant
289 1.11 perseant /*
290 1.11 perseant * Get rid of any blocks we've marked dead. If this is an older
291 1.11 perseant * kernel that doesn't have bmapv fill in the block sizes, we'll
292 1.11 perseant * toss everything here.
293 1.1 perseant */
294 1.11 perseant onb = nb;
295 1.13 perseant toss_old_blocks(fs, &bip, &nb, NULL);
296 1.11 perseant nb = i;
297 1.2 perseant
298 1.1 perseant /*
299 1.2 perseant * We may have tossed enough blocks that it is no longer worthwhile
300 1.2 perseant * to rewrite this inode.
301 1.1 perseant */
302 1.11 perseant if (nb == 0 || onb - nb > log2int(onb)) {
303 1.3 perseant if (debug)
304 1.3 perseant syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
305 1.11 perseant retval = COALESCE_NOTHINGLEFT;
306 1.11 perseant goto out;
307 1.1 perseant }
308 1.1 perseant
309 1.11 perseant /*
310 1.1 perseant * We are going to rewrite this inode.
311 1.1 perseant * For any remaining blocks, read in their contents.
312 1.1 perseant */
313 1.1 perseant for (i = 0; i < nb; i++) {
314 1.1 perseant bip[i].bi_bp = malloc(bip[i].bi_size);
315 1.5 yamt if (bip[i].bi_bp == NULL) {
316 1.29 dholland syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
317 1.29 dholland bip[i].bi_size, strerror(errno));
318 1.5 yamt retval = COALESCE_NOMEM;
319 1.5 yamt goto out;
320 1.5 yamt }
321 1.11 perseant
322 1.18 pooka if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
323 1.23 christos lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
324 1.5 yamt retval = COALESCE_EIO;
325 1.5 yamt goto out;
326 1.5 yamt }
327 1.1 perseant }
328 1.1 perseant if (debug)
329 1.29 dholland syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
330 1.29 dholland (uintmax_t)ino, (intmax_t)nb);
331 1.1 perseant
332 1.2 perseant /*
333 1.2 perseant * Write in segment-sized chunks. If at any point we'd write more
334 1.2 perseant * than half of the available segments, sleep until that's not
335 1.2 perseant * true any more.
336 1.29 dholland *
337 1.29 dholland * XXX the pointer arithmetic in this loop is illegal; replace
338 1.29 dholland * TBIP with an integer (blkcnt_t) offset.
339 1.2 perseant */
340 1.23 christos bps = lfs_segtod(fs, 1);
341 1.1 perseant for (tbip = bip; tbip < bip + nb; tbip += bps) {
342 1.11 perseant do {
343 1.28 dholland bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp);
344 1.11 perseant cip = *(CLEANERINFO *)bp->b_data;
345 1.14 ad brelse(bp, B_INVAL);
346 1.11 perseant
347 1.11 perseant if (cip.clean < 4) /* XXX magic number 4 */
348 1.18 pooka kops.ko_fcntl(fs->clfs_ifilefd,
349 1.18 pooka LFCNSEGWAIT, NULL);
350 1.11 perseant } while(cip.clean < 4);
351 1.11 perseant
352 1.29 dholland /*
353 1.29 dholland * Note that although lim.blkcnt is 32 bits wide, bps
354 1.29 dholland * (which is blocks-per-segment) is < 2^32 so the
355 1.29 dholland * value assigned here is always in range.
356 1.29 dholland */
357 1.11 perseant lim.blkiov = tbip;
358 1.11 perseant lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
359 1.18 pooka if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
360 1.11 perseant retval = COALESCE_BADMARKV;
361 1.11 perseant goto out;
362 1.2 perseant }
363 1.1 perseant }
364 1.1 perseant
365 1.5 yamt retval = COALESCE_OK;
366 1.5 yamt out:
367 1.11 perseant free(dip);
368 1.5 yamt if (bip) {
369 1.5 yamt for (i = 0; i < onb; i++)
370 1.5 yamt if (bip[i].bi_bp)
371 1.5 yamt free(bip[i].bi_bp);
372 1.5 yamt free(bip);
373 1.5 yamt }
374 1.5 yamt return retval;
375 1.1 perseant }
376 1.1 perseant
377 1.1 perseant /*
378 1.1 perseant * Try coalescing every inode in the filesystem.
379 1.1 perseant * Return the number of inodes actually altered.
380 1.1 perseant */
381 1.11 perseant int clean_all_inodes(struct clfs *fs)
382 1.1 perseant {
383 1.11 perseant int i, r, maxino;
384 1.3 perseant int totals[COALESCE_MAXERROR];
385 1.11 perseant struct stat st;
386 1.1 perseant
387 1.3 perseant memset(totals, 0, sizeof(totals));
388 1.11 perseant
389 1.11 perseant fstat(fs->clfs_ifilefd, &st);
390 1.26 dholland maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
391 1.25 dholland lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
392 1.11 perseant
393 1.11 perseant for (i = 0; i < maxino; i++) {
394 1.11 perseant r = clean_inode(fs, i);
395 1.3 perseant ++totals[r];
396 1.1 perseant }
397 1.3 perseant
398 1.3 perseant for (i = 0; i < COALESCE_MAXERROR; i++)
399 1.3 perseant if (totals[i])
400 1.3 perseant syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
401 1.11 perseant totals[i]);
402 1.11 perseant
403 1.3 perseant return totals[COALESCE_OK];
404 1.1 perseant }
405 1.1 perseant
406 1.11 perseant /*
407 1.11 perseant * Fork a child process to coalesce this fs.
408 1.11 perseant */
409 1.11 perseant int
410 1.11 perseant fork_coalesce(struct clfs *fs)
411 1.1 perseant {
412 1.1 perseant static pid_t childpid;
413 1.2 perseant int num;
414 1.2 perseant
415 1.11 perseant /*
416 1.11 perseant * If already running a coalescing child, don't start a new one.
417 1.11 perseant */
418 1.1 perseant if (childpid) {
419 1.11 perseant if (waitpid(childpid, NULL, WNOHANG) == childpid)
420 1.1 perseant childpid = 0;
421 1.1 perseant }
422 1.1 perseant if (childpid && kill(childpid, 0) >= 0) {
423 1.1 perseant /* already running a coalesce process */
424 1.2 perseant if (debug)
425 1.2 perseant syslog(LOG_DEBUG, "coalescing already in progress");
426 1.1 perseant return 0;
427 1.1 perseant }
428 1.11 perseant
429 1.11 perseant /*
430 1.11 perseant * Fork a child and let the child coalease
431 1.11 perseant */
432 1.1 perseant childpid = fork();
433 1.1 perseant if (childpid < 0) {
434 1.26 dholland syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
435 1.1 perseant return 0;
436 1.1 perseant } else if (childpid == 0) {
437 1.11 perseant syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
438 1.26 dholland lfs_sb_getfsmnt(fs), getpid());
439 1.11 perseant num = clean_all_inodes(fs);
440 1.11 perseant syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
441 1.26 dholland lfs_sb_getfsmnt(fs), num);
442 1.1 perseant exit(0);
443 1.1 perseant }
444 1.11 perseant
445 1.1 perseant return 0;
446 1.1 perseant }
447