Home | History | Annotate | Line # | Download | only in lfs_cleanerd
coalesce.c revision 1.19
      1 /*      $NetBSD: coalesce.c,v 1.19 2012/01/02 21:35:17 perseant Exp $  */
      2 
      3 /*-
      4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/param.h>
     33 #include <sys/mount.h>
     34 #include <sys/time.h>
     35 #include <sys/resource.h>
     36 #include <sys/types.h>
     37 #include <sys/wait.h>
     38 #include <sys/mman.h>
     39 
     40 #include <ufs/ufs/dinode.h>
     41 #include <ufs/lfs/lfs.h>
     42 
     43 #include <fcntl.h>
     44 #include <signal.h>
     45 #include <stdio.h>
     46 #include <stdlib.h>
     47 #include <string.h>
     48 #include <time.h>
     49 #include <unistd.h>
     50 #include <util.h>
     51 #include <errno.h>
     52 #include <err.h>
     53 
     54 #include <syslog.h>
     55 
     56 #include "bufcache.h"
     57 #include "vnode.h"
     58 #include "cleaner.h"
     59 #include "kernelops.h"
     60 
     61 extern int debug, do_mmap;
     62 
     63 int log2int(int n)
     64 {
     65 	int log;
     66 
     67 	log = 0;
     68 	while (n > 0) {
     69 		++log;
     70 		n >>= 1;
     71 	}
     72 	return log - 1;
     73 }
     74 
     75 enum coalesce_returncodes {
     76 	COALESCE_OK = 0,
     77 	COALESCE_NOINODE,
     78 	COALESCE_TOOSMALL,
     79 	COALESCE_BADSIZE,
     80 	COALESCE_BADBLOCKSIZE,
     81 	COALESCE_NOMEM,
     82 	COALESCE_BADBMAPV,
     83 	COALESCE_BADMARKV,
     84 	COALESCE_NOTWORTHIT,
     85 	COALESCE_NOTHINGLEFT,
     86 	COALESCE_EIO,
     87 
     88 	COALESCE_MAXERROR
     89 };
     90 
     91 const char *coalesce_return[] = {
     92 	"Successfully coalesced",
     93 	"File not in use or inode not found",
     94 	"Not large enough to coalesce",
     95 	"Negative size",
     96 	"Not enough blocks to account for size",
     97 	"Malloc failed",
     98 	"LFCNBMAPV failed",
     99 	"Not broken enough to fix",
    100 	"Too many blocks not found",
    101 	"Too many blocks found in active segments",
    102 	"I/O error",
    103 
    104 	"No such error"
    105 };
    106 
    107 static struct ufs1_dinode *
    108 get_dinode(struct clfs *fs, ino_t ino)
    109 {
    110 	IFILE *ifp;
    111 	daddr_t daddr;
    112 	struct ubuf *bp;
    113 	struct ufs1_dinode *dip, *r;
    114 
    115 	lfs_ientry(&ifp, fs, ino, &bp);
    116 	daddr = ifp->if_daddr;
    117 	brelse(bp, 0);
    118 
    119 	if (daddr == 0x0)
    120 		return NULL;
    121 
    122 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
    123 	for (dip = (struct ufs1_dinode *)bp->b_data;
    124 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
    125 		if (dip->di_inumber == ino) {
    126 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
    127 			if (r == NULL)
    128 				break;
    129 			memcpy(r, dip, sizeof(*r));
    130 			brelse(bp, 0);
    131 			return r;
    132 		}
    133 	brelse(bp, 0);
    134 	return NULL;
    135 }
    136 
    137 /*
    138  * Find out if this inode's data blocks are discontinuous; if they are,
    139  * rewrite them using markv.  Return the number of inodes rewritten.
    140  */
    141 static int
    142 clean_inode(struct clfs *fs, ino_t ino)
    143 {
    144 	BLOCK_INFO *bip = NULL, *tbip;
    145 	CLEANERINFO cip;
    146 	struct ubuf *bp;
    147 	struct ufs1_dinode *dip;
    148 	struct clfs_seguse *sup;
    149 	struct lfs_fcntl_markv /* {
    150 		BLOCK_INFO *blkiov;
    151 		int blkcnt;
    152 	} */ lim;
    153 	daddr_t toff;
    154 	int i;
    155 	int nb, onb, noff;
    156 	int retval;
    157 	int bps;
    158 
    159 	dip = get_dinode(fs, ino);
    160 	if (dip == NULL)
    161 		return COALESCE_NOINODE;
    162 
    163 	/* Compute file block size, set up for bmapv */
    164 	onb = nb = lblkno(fs, dip->di_size);
    165 
    166 	/* XXX for now, don't do any file small enough to have fragments */
    167 	if (nb < NDADDR) {
    168 		free(dip);
    169 		return COALESCE_TOOSMALL;
    170 	}
    171 
    172 	/* Sanity checks */
    173 #if 0	/* di_size is uint64_t -- this is a noop */
    174 	if (dip->di_size < 0) {
    175 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
    176 		free(dip);
    177 		return COALESCE_BADSIZE;
    178 	}
    179 #endif
    180 	if (nb > dip->di_blocks) {
    181 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
    182 		     dip->di_blocks);
    183 		free(dip);
    184 		return COALESCE_BADBLOCKSIZE;
    185 	}
    186 
    187 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
    188 	if (bip == NULL) {
    189 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
    190 		    (unsigned long long)ino, nb);
    191 		free(dip);
    192 		return COALESCE_NOMEM;
    193 	}
    194 	for (i = 0; i < nb; i++) {
    195 		memset(bip + i, 0, sizeof(BLOCK_INFO));
    196 		bip[i].bi_inode = ino;
    197 		bip[i].bi_lbn = i;
    198 		bip[i].bi_version = dip->di_gen;
    199 		/* Don't set the size, but let lfs_bmap fill it in */
    200 	}
    201 	lim.blkiov = bip;
    202 	lim.blkcnt = nb;
    203 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
    204 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
    205 		       fs->lfs_fsmnt);
    206 		retval = COALESCE_BADBMAPV;
    207 		goto out;
    208 	}
    209 #if 0
    210 	for (i = 0; i < nb; i++) {
    211 		printf("bi_size = %d, bi_ino = %d, "
    212 		    "bi_lbn = %d, bi_daddr = %d\n",
    213 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
    214 		    bip[i].bi_daddr);
    215 	}
    216 #endif
    217 	noff = toff = 0;
    218 	for (i = 1; i < nb; i++) {
    219 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
    220 			++noff;
    221 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
    222 		    - fs->lfs_frag) >> fs->lfs_fbshift;
    223 	}
    224 
    225 	/*
    226 	 * If this file is not discontinuous, there's no point in rewriting it.
    227 	 *
    228 	 * Explicitly allow a certain amount of discontinuity, since large
    229 	 * files will be broken among segments and medium-sized files
    230 	 * can have a break or two and it's okay.
    231 	 */
    232 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
    233 	    segtod(fs, noff) * 2 < nb) {
    234 		retval = COALESCE_NOTWORTHIT;
    235 		goto out;
    236 	} else if (debug)
    237 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
    238 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
    239 		    noff, (long long)toff, nb);
    240 
    241 	/* Search for blocks in active segments; don't move them. */
    242 	for (i = 0; i < nb; i++) {
    243 		if (bip[i].bi_daddr <= 0)
    244 			continue;
    245 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
    246 		if (sup->flags & SEGUSE_ACTIVE)
    247 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
    248 	}
    249 
    250 	/*
    251 	 * Get rid of any blocks we've marked dead.  If this is an older
    252 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
    253 	 * toss everything here.
    254 	 */
    255 	onb = nb;
    256 	toss_old_blocks(fs, &bip, &nb, NULL);
    257 	nb = i;
    258 
    259 	/*
    260 	 * We may have tossed enough blocks that it is no longer worthwhile
    261 	 * to rewrite this inode.
    262 	 */
    263 	if (nb == 0 || onb - nb > log2int(onb)) {
    264 		if (debug)
    265 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
    266 		retval = COALESCE_NOTHINGLEFT;
    267 		goto out;
    268 	}
    269 
    270 	/*
    271 	 * We are going to rewrite this inode.
    272 	 * For any remaining blocks, read in their contents.
    273 	 */
    274 	for (i = 0; i < nb; i++) {
    275 		bip[i].bi_bp = malloc(bip[i].bi_size);
    276 		if (bip[i].bi_bp == NULL) {
    277 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
    278 			    bip[i].bi_size);
    279 			retval = COALESCE_NOMEM;
    280 			goto out;
    281 		}
    282 
    283 		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
    284 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
    285 			retval = COALESCE_EIO;
    286 			goto out;
    287 		}
    288 	}
    289 	if (debug)
    290 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
    291 		    (unsigned long long)ino, nb);
    292 
    293 	/*
    294 	 * Write in segment-sized chunks.  If at any point we'd write more
    295 	 * than half of the available segments, sleep until that's not
    296 	 * true any more.
    297 	 */
    298 	bps = segtod(fs, 1);
    299 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
    300 		do {
    301 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
    302 			cip = *(CLEANERINFO *)bp->b_data;
    303 			brelse(bp, B_INVAL);
    304 
    305 			if (cip.clean < 4) /* XXX magic number 4 */
    306 				kops.ko_fcntl(fs->clfs_ifilefd,
    307 				    LFCNSEGWAIT, NULL);
    308 		} while(cip.clean < 4);
    309 
    310 		lim.blkiov = tbip;
    311 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
    312 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
    313 			retval = COALESCE_BADMARKV;
    314 			goto out;
    315 		}
    316 	}
    317 
    318 	retval = COALESCE_OK;
    319 out:
    320 	free(dip);
    321 	if (bip) {
    322 		for (i = 0; i < onb; i++)
    323 			if (bip[i].bi_bp)
    324 				free(bip[i].bi_bp);
    325 		free(bip);
    326 	}
    327 	return retval;
    328 }
    329 
    330 /*
    331  * Try coalescing every inode in the filesystem.
    332  * Return the number of inodes actually altered.
    333  */
    334 int clean_all_inodes(struct clfs *fs)
    335 {
    336 	int i, r, maxino;
    337 	int totals[COALESCE_MAXERROR];
    338 	struct stat st;
    339 
    340 	memset(totals, 0, sizeof(totals));
    341 
    342 	fstat(fs->clfs_ifilefd, &st);
    343 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
    344 		fs->lfs_segtabsz - fs->lfs_cleansz;
    345 
    346 	for (i = 0; i < maxino; i++) {
    347 		r = clean_inode(fs, i);
    348 		++totals[r];
    349 	}
    350 
    351 	for (i = 0; i < COALESCE_MAXERROR; i++)
    352 		if (totals[i])
    353 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
    354 			       totals[i]);
    355 
    356 	return totals[COALESCE_OK];
    357 }
    358 
    359 /*
    360  * Fork a child process to coalesce this fs.
    361  */
    362 int
    363 fork_coalesce(struct clfs *fs)
    364 {
    365 	static pid_t childpid;
    366 	int num;
    367 
    368 	/*
    369 	 * If already running a coalescing child, don't start a new one.
    370 	 */
    371 	if (childpid) {
    372 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
    373 			childpid = 0;
    374 	}
    375 	if (childpid && kill(childpid, 0) >= 0) {
    376 		/* already running a coalesce process */
    377 		if (debug)
    378 			syslog(LOG_DEBUG, "coalescing already in progress");
    379 		return 0;
    380 	}
    381 
    382 	/*
    383 	 * Fork a child and let the child coalease
    384 	 */
    385 	childpid = fork();
    386 	if (childpid < 0) {
    387 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
    388 		return 0;
    389 	} else if (childpid == 0) {
    390 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
    391 		       fs->lfs_fsmnt, getpid());
    392 		num = clean_all_inodes(fs);
    393 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
    394 		       fs->lfs_fsmnt, num);
    395 		exit(0);
    396 	}
    397 
    398 	return 0;
    399 }
    400