Home | History | Annotate | Line # | Download | only in lfs_cleanerd
coalesce.c revision 1.29
      1 /*      $NetBSD: coalesce.c,v 1.29 2015/08/12 18:23:16 dholland Exp $  */
      2 
      3 /*-
      4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/param.h>
     33 #include <sys/mount.h>
     34 #include <sys/time.h>
     35 #include <sys/resource.h>
     36 #include <sys/types.h>
     37 #include <sys/wait.h>
     38 #include <sys/mman.h>
     39 
     40 #include <ufs/lfs/lfs.h>
     41 
     42 #include <fcntl.h>
     43 #include <signal.h>
     44 #include <stdio.h>
     45 #include <stdlib.h>
     46 #include <string.h>
     47 #include <time.h>
     48 #include <unistd.h>
     49 #include <util.h>
     50 #include <errno.h>
     51 #include <err.h>
     52 #include <assert.h>
     53 
     54 #include <syslog.h>
     55 
     56 #include "bufcache.h"
     57 #include "vnode.h"
     58 #include "cleaner.h"
     59 #include "kernelops.h"
     60 
     61 extern int debug, do_mmap;
     62 
     63 /*
     64  * XXX return the arg to just int when/if we don't need it for
     65  * potentially huge block counts any more.
     66  */
     67 static int
     68 log2int(intmax_t n)
     69 {
     70 	int log;
     71 
     72 	log = 0;
     73 	while (n > 0) {
     74 		++log;
     75 		n >>= 1;
     76 	}
     77 	return log - 1;
     78 }
     79 
     80 enum coalesce_returncodes {
     81 	COALESCE_OK = 0,
     82 	COALESCE_NOINODE,
     83 	COALESCE_TOOSMALL,
     84 	COALESCE_BADSIZE,
     85 	COALESCE_BADBLOCKSIZE,
     86 	COALESCE_NOMEM,
     87 	COALESCE_BADBMAPV,
     88 	COALESCE_BADMARKV,
     89 	COALESCE_NOTWORTHIT,
     90 	COALESCE_NOTHINGLEFT,
     91 	COALESCE_EIO,
     92 
     93 	COALESCE_MAXERROR
     94 };
     95 
     96 const char *coalesce_return[] = {
     97 	"Successfully coalesced",
     98 	"File not in use or inode not found",
     99 	"Not large enough to coalesce",
    100 	"Negative size",
    101 	"Not enough blocks to account for size",
    102 	"Malloc failed",
    103 	"LFCNBMAPV failed",
    104 	"Not broken enough to fix",
    105 	"Too many blocks not found",
    106 	"Too many blocks found in active segments",
    107 	"I/O error",
    108 
    109 	"No such error"
    110 };
    111 
    112 static struct ulfs1_dinode *
    113 get_dinode(struct clfs *fs, ino_t ino)
    114 {
    115 	IFILE *ifp;
    116 	daddr_t daddr;
    117 	struct ubuf *bp;
    118 	struct ulfs1_dinode *dip, *r;
    119 
    120 	lfs_ientry(&ifp, fs, ino, &bp);
    121 	daddr = ifp->if_daddr;
    122 	brelse(bp, 0);
    123 
    124 	if (daddr == 0x0)
    125 		return NULL;
    126 
    127 	bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
    128 	for (dip = (struct ulfs1_dinode *)bp->b_data;
    129 	     dip < (struct ulfs1_dinode *)(bp->b_data + lfs_sb_getibsize(fs)); dip++)
    130 		if (dip->di_inumber == ino) {
    131 			r = (struct ulfs1_dinode *)malloc(sizeof(*r));
    132 			if (r == NULL)
    133 				break;
    134 			memcpy(r, dip, sizeof(*r));
    135 			brelse(bp, 0);
    136 			return r;
    137 		}
    138 	brelse(bp, 0);
    139 	return NULL;
    140 }
    141 
    142 /*
    143  * Find out if this inode's data blocks are discontinuous; if they are,
    144  * rewrite them using markv.  Return the number of inodes rewritten.
    145  */
    146 static int
    147 clean_inode(struct clfs *fs, ino_t ino)
    148 {
    149 	BLOCK_INFO *bip = NULL, *tbip;
    150 	CLEANERINFO cip;
    151 	struct ubuf *bp;
    152 	struct ulfs1_dinode *dip;
    153 	struct clfs_seguse *sup;
    154 	struct lfs_fcntl_markv /* {
    155 		BLOCK_INFO *blkiov;
    156 		int blkcnt;
    157 	} */ lim;
    158 	daddr_t toff;
    159 	int noff;
    160 	blkcnt_t i, nb, onb;
    161 	int retval;
    162 	int bps;
    163 
    164 	dip = get_dinode(fs, ino);
    165 	if (dip == NULL)
    166 		return COALESCE_NOINODE;
    167 
    168 	/* Compute file block size, set up for bmapv */
    169 	onb = nb = lfs_lblkno(fs, dip->di_size);
    170 
    171 	/* XXX for now, don't do any file small enough to have fragments */
    172 	if (nb < ULFS_NDADDR) {
    173 		free(dip);
    174 		return COALESCE_TOOSMALL;
    175 	}
    176 
    177 	/* Sanity checks */
    178 #if 0	/* di_size is uint64_t -- this is a noop */
    179 	if (dip->di_size < 0) {
    180 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
    181 		free(dip);
    182 		return COALESCE_BADSIZE;
    183 	}
    184 #endif
    185 	if (nb > dip->di_blocks) {
    186 		dlog("ino %ju, computed blocks %jd > held blocks %ju",
    187 		     (uintmax_t)ino, (intmax_t)nb,
    188 		     (uintmax_t)dip->di_blocks);
    189 		free(dip);
    190 		return COALESCE_BADBLOCKSIZE;
    191 	}
    192 
    193 	/*
    194 	 * XXX: We should really coalesce really large files in
    195 	 * chunks, as there's substantial diminishing returns and
    196 	 * mallocing huge amounts of memory just to get those returns
    197 	 * is pretty silly. But that requires a big rework of this
    198 	 * code. (On the plus side though then we can stop worrying
    199 	 * about block counts > 2^31.)
    200 	 */
    201 
    202 	/* ugh, no DADDR_T_MAX */
    203 	__CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
    204 	if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
    205 		syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
    206 		       (uintmax_t)ino, (uintmax_t)nb);
    207 		free(dip);
    208 		return COALESCE_NOMEM;
    209 	}
    210 
    211 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
    212 	if (bip == NULL) {
    213 		syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
    214 		    (unsigned long long)ino, (intmax_t)nb,
    215 		    strerror(errno));
    216 		free(dip);
    217 		return COALESCE_NOMEM;
    218 	}
    219 	for (i = 0; i < nb; i++) {
    220 		memset(bip + i, 0, sizeof(BLOCK_INFO));
    221 		bip[i].bi_inode = ino;
    222 		bip[i].bi_lbn = i;
    223 		bip[i].bi_version = dip->di_gen;
    224 		/* Don't set the size, but let lfs_bmap fill it in */
    225 	}
    226 	/*
    227 	 * The kernel also contains this check; but as lim.blkcnt is
    228 	 * only 32 bits wide, we need to check ourselves too in case
    229 	 * we'd otherwise truncate a value > 2^31, as that might
    230 	 * succeed and create bizarre results.
    231 	 */
    232 	if (nb > LFS_MARKV_MAXBLKCNT) {
    233 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
    234 		       lfs_sb_getfsmnt(fs));
    235 		retval = COALESCE_BADBMAPV;
    236 		goto out;
    237 	}
    238 	lim.blkiov = bip;
    239 	lim.blkcnt = nb;
    240 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
    241 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
    242 		       lfs_sb_getfsmnt(fs));
    243 		retval = COALESCE_BADBMAPV;
    244 		goto out;
    245 	}
    246 #if 0
    247 	for (i = 0; i < nb; i++) {
    248 		printf("bi_size = %d, bi_ino = %ju, "
    249 		    "bi_lbn = %jd, bi_daddr = %jd\n",
    250 		    bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
    251 		    (intmax_t)bip[i].bi_lbn,
    252 		    (intmax_t)bip[i].bi_daddr);
    253 	}
    254 #endif
    255 	noff = 0;
    256 	toff = 0;
    257 	for (i = 1; i < nb; i++) {
    258 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
    259 			++noff;
    260 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
    261 		    - lfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
    262 	}
    263 
    264 	/*
    265 	 * If this file is not discontinuous, there's no point in rewriting it.
    266 	 *
    267 	 * Explicitly allow a certain amount of discontinuity, since large
    268 	 * files will be broken among segments and medium-sized files
    269 	 * can have a break or two and it's okay.
    270 	 */
    271 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
    272 	    lfs_segtod(fs, noff) * 2 < nb) {
    273 		retval = COALESCE_NOTWORTHIT;
    274 		goto out;
    275 	} else if (debug)
    276 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
    277 		    "%d (%jd) for %jd blocks", (unsigned long long)ino,
    278 		    noff, (intmax_t)toff, (intmax_t)nb);
    279 
    280 	/* Search for blocks in active segments; don't move them. */
    281 	for (i = 0; i < nb; i++) {
    282 		if (bip[i].bi_daddr <= 0)
    283 			continue;
    284 		sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
    285 		if (sup->flags & SEGUSE_ACTIVE)
    286 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
    287 	}
    288 
    289 	/*
    290 	 * Get rid of any blocks we've marked dead.  If this is an older
    291 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
    292 	 * toss everything here.
    293 	 */
    294 	onb = nb;
    295 	toss_old_blocks(fs, &bip, &nb, NULL);
    296 	nb = i;
    297 
    298 	/*
    299 	 * We may have tossed enough blocks that it is no longer worthwhile
    300 	 * to rewrite this inode.
    301 	 */
    302 	if (nb == 0 || onb - nb > log2int(onb)) {
    303 		if (debug)
    304 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
    305 		retval = COALESCE_NOTHINGLEFT;
    306 		goto out;
    307 	}
    308 
    309 	/*
    310 	 * We are going to rewrite this inode.
    311 	 * For any remaining blocks, read in their contents.
    312 	 */
    313 	for (i = 0; i < nb; i++) {
    314 		bip[i].bi_bp = malloc(bip[i].bi_size);
    315 		if (bip[i].bi_bp == NULL) {
    316 			syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
    317 			    bip[i].bi_size, strerror(errno));
    318 			retval = COALESCE_NOMEM;
    319 			goto out;
    320 		}
    321 
    322 		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
    323 			  lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
    324 			retval = COALESCE_EIO;
    325 			goto out;
    326 		}
    327 	}
    328 	if (debug)
    329 		syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
    330 		    (uintmax_t)ino, (intmax_t)nb);
    331 
    332 	/*
    333 	 * Write in segment-sized chunks.  If at any point we'd write more
    334 	 * than half of the available segments, sleep until that's not
    335 	 * true any more.
    336 	 *
    337 	 * XXX the pointer arithmetic in this loop is illegal; replace
    338 	 * TBIP with an integer (blkcnt_t) offset.
    339 	 */
    340 	bps = lfs_segtod(fs, 1);
    341 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
    342 		do {
    343 			bread(fs->lfs_ivnode, 0, lfs_sb_getbsize(fs), 0, &bp);
    344 			cip = *(CLEANERINFO *)bp->b_data;
    345 			brelse(bp, B_INVAL);
    346 
    347 			if (cip.clean < 4) /* XXX magic number 4 */
    348 				kops.ko_fcntl(fs->clfs_ifilefd,
    349 				    LFCNSEGWAIT, NULL);
    350 		} while(cip.clean < 4);
    351 
    352 		/*
    353 		 * Note that although lim.blkcnt is 32 bits wide, bps
    354 		 * (which is blocks-per-segment) is < 2^32 so the
    355 		 * value assigned here is always in range.
    356 		 */
    357 		lim.blkiov = tbip;
    358 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
    359 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
    360 			retval = COALESCE_BADMARKV;
    361 			goto out;
    362 		}
    363 	}
    364 
    365 	retval = COALESCE_OK;
    366 out:
    367 	free(dip);
    368 	if (bip) {
    369 		for (i = 0; i < onb; i++)
    370 			if (bip[i].bi_bp)
    371 				free(bip[i].bi_bp);
    372 		free(bip);
    373 	}
    374 	return retval;
    375 }
    376 
    377 /*
    378  * Try coalescing every inode in the filesystem.
    379  * Return the number of inodes actually altered.
    380  */
    381 int clean_all_inodes(struct clfs *fs)
    382 {
    383 	int i, r, maxino;
    384 	int totals[COALESCE_MAXERROR];
    385 	struct stat st;
    386 
    387 	memset(totals, 0, sizeof(totals));
    388 
    389 	fstat(fs->clfs_ifilefd, &st);
    390 	maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
    391 		lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
    392 
    393 	for (i = 0; i < maxino; i++) {
    394 		r = clean_inode(fs, i);
    395 		++totals[r];
    396 	}
    397 
    398 	for (i = 0; i < COALESCE_MAXERROR; i++)
    399 		if (totals[i])
    400 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
    401 			       totals[i]);
    402 
    403 	return totals[COALESCE_OK];
    404 }
    405 
    406 /*
    407  * Fork a child process to coalesce this fs.
    408  */
    409 int
    410 fork_coalesce(struct clfs *fs)
    411 {
    412 	static pid_t childpid;
    413 	int num;
    414 
    415 	/*
    416 	 * If already running a coalescing child, don't start a new one.
    417 	 */
    418 	if (childpid) {
    419 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
    420 			childpid = 0;
    421 	}
    422 	if (childpid && kill(childpid, 0) >= 0) {
    423 		/* already running a coalesce process */
    424 		if (debug)
    425 			syslog(LOG_DEBUG, "coalescing already in progress");
    426 		return 0;
    427 	}
    428 
    429 	/*
    430 	 * Fork a child and let the child coalease
    431 	 */
    432 	childpid = fork();
    433 	if (childpid < 0) {
    434 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
    435 		return 0;
    436 	} else if (childpid == 0) {
    437 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
    438 		       lfs_sb_getfsmnt(fs), getpid());
    439 		num = clean_all_inodes(fs);
    440 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
    441 		       lfs_sb_getfsmnt(fs), num);
    442 		exit(0);
    443 	}
    444 
    445 	return 0;
    446 }
    447