Home | History | Annotate | Line # | Download | only in lfs_cleanerd
coalesce.c revision 1.14.6.1
      1 /*      $NetBSD: coalesce.c,v 1.14.6.1 2008/05/18 12:30:45 yamt Exp $  */
      2 
      3 /*-
      4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/param.h>
     33 #include <sys/mount.h>
     34 #include <sys/time.h>
     35 #include <sys/resource.h>
     36 #include <sys/types.h>
     37 #include <sys/wait.h>
     38 #include <sys/mman.h>
     39 
     40 #include <ufs/ufs/dinode.h>
     41 #include <ufs/lfs/lfs.h>
     42 
     43 #include <fcntl.h>
     44 #include <signal.h>
     45 #include <stdio.h>
     46 #include <stdlib.h>
     47 #include <string.h>
     48 #include <time.h>
     49 #include <unistd.h>
     50 #include <util.h>
     51 #include <errno.h>
     52 #include <err.h>
     53 
     54 #include <syslog.h>
     55 
     56 #include "bufcache.h"
     57 #include "vnode.h"
     58 #include "cleaner.h"
     59 
/* Flags defined outside this file; `debug` gates the LOG_DEBUG messages below. */
extern int debug, do_mmap;
     61 
/*
 * Integer base-2 logarithm: the zero-based position of the highest set
 * bit of n.  Returns -1 when n is zero or negative.
 */
int log2int(int n)
{
	int bits;

	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
     73 
/*
 * Outcome codes returned by clean_inode().  COALESCE_MAXERROR is not a
 * real outcome: it equals the number of codes above it and is used to
 * size per-code tables.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each outcome, indexed by
 * enum coalesce_returncodes.  Each string is tied to its code with a
 * designated initializer so the table cannot drift out of step with the
 * enum (previously COALESCE_BADMARKV had no string of its own, which
 * shifted the descriptions of the codes that follow it).
 */
char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	=
	    "Too many blocks not found or in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
    105 
    106 static struct ufs1_dinode *
    107 get_dinode(struct clfs *fs, ino_t ino)
    108 {
    109 	IFILE *ifp;
    110 	daddr_t daddr;
    111 	struct ubuf *bp;
    112 	struct ufs1_dinode *dip, *r;
    113 
    114 	lfs_ientry(&ifp, fs, ino, &bp);
    115 	daddr = ifp->if_daddr;
    116 	brelse(bp, 0);
    117 
    118 	if (daddr == 0x0)
    119 		return NULL;
    120 
    121 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp);
    122 	for (dip = (struct ufs1_dinode *)bp->b_data;
    123 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
    124 		if (dip->di_inumber == ino) {
    125 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
    126 			memcpy(r, dip, sizeof(*r));
    127 			brelse(bp, 0);
    128 			return r;
    129 		}
    130 	brelse(bp, 0);
    131 	return NULL;
    132 }
    133 
    134 /*
    135  * Find out if this inode's data blocks are discontinuous; if they are,
    136  * rewrite them using markv.  Return the number of inodes rewritten.
    137  */
    138 static int
    139 clean_inode(struct clfs *fs, ino_t ino)
    140 {
    141 	BLOCK_INFO *bip = NULL, *tbip;
    142 	CLEANERINFO cip;
    143 	struct ubuf *bp;
    144 	struct ufs1_dinode *dip;
    145 	struct clfs_seguse *sup;
    146 	struct lfs_fcntl_markv /* {
    147 		BLOCK_INFO *blkiov;
    148 		int blkcnt;
    149 	} */ lim;
    150 	daddr_t toff;
    151 	int i;
    152 	int nb, onb, noff;
    153 	int retval;
    154 	int bps;
    155 
    156 	dip = get_dinode(fs, ino);
    157 	if (dip == NULL)
    158 		return COALESCE_NOINODE;
    159 
    160 	/* Compute file block size, set up for bmapv */
    161 	onb = nb = lblkno(fs, dip->di_size);
    162 
    163 	/* XXX for now, don't do any file small enough to have fragments */
    164 	if (nb < NDADDR) {
    165 		free(dip);
    166 		return COALESCE_TOOSMALL;
    167 	}
    168 
    169 	/* Sanity checks */
    170 	if (dip->di_size < 0) {
    171 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
    172 		free(dip);
    173 		return COALESCE_BADSIZE;
    174 	}
    175 	if (nb > dip->di_blocks) {
    176 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
    177 		     dip->di_blocks);
    178 		free(dip);
    179 		return COALESCE_BADBLOCKSIZE;
    180 	}
    181 
    182 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
    183 	if (bip == NULL) {
    184 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
    185 		    (unsigned long long)ino, nb);
    186 		free(dip);
    187 		return COALESCE_NOMEM;
    188 	}
    189 	for (i = 0; i < nb; i++) {
    190 		memset(bip + i, 0, sizeof(BLOCK_INFO));
    191 		bip[i].bi_inode = ino;
    192 		bip[i].bi_lbn = i;
    193 		bip[i].bi_version = dip->di_gen;
    194 		/* Don't set the size, but let lfs_bmap fill it in */
    195 	}
    196 	lim.blkiov = bip;
    197 	lim.blkcnt = nb;
    198 	if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
    199 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
    200 		       fs->lfs_fsmnt);
    201 		retval = COALESCE_BADBMAPV;
    202 		goto out;
    203 	}
    204 #if 0
    205 	for (i = 0; i < nb; i++) {
    206 		printf("bi_size = %d, bi_ino = %d, "
    207 		    "bi_lbn = %d, bi_daddr = %d\n",
    208 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
    209 		    bip[i].bi_daddr);
    210 	}
    211 #endif
    212 	noff = toff = 0;
    213 	for (i = 1; i < nb; i++) {
    214 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
    215 			++noff;
    216 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
    217 		    - fs->lfs_frag) >> fs->lfs_fbshift;
    218 	}
    219 
    220 	/*
    221 	 * If this file is not discontinuous, there's no point in rewriting it.
    222 	 *
    223 	 * Explicitly allow a certain amount of discontinuity, since large
    224 	 * files will be broken among segments and medium-sized files
    225 	 * can have a break or two and it's okay.
    226 	 */
    227 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
    228 	    segtod(fs, noff) * 2 < nb) {
    229 		retval = COALESCE_NOTWORTHIT;
    230 		goto out;
    231 	} else if (debug)
    232 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
    233 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
    234 		    noff, (long long)toff, nb);
    235 
    236 	/* Search for blocks in active segments; don't move them. */
    237 	for (i = 0; i < nb; i++) {
    238 		if (bip[i].bi_daddr <= 0)
    239 			continue;
    240 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
    241 		if (sup->flags & SEGUSE_ACTIVE)
    242 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
    243 	}
    244 
    245 	/*
    246 	 * Get rid of any blocks we've marked dead.  If this is an older
    247 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
    248 	 * toss everything here.
    249 	 */
    250 	onb = nb;
    251 	toss_old_blocks(fs, &bip, &nb, NULL);
    252 	nb = i;
    253 
    254 	/*
    255 	 * We may have tossed enough blocks that it is no longer worthwhile
    256 	 * to rewrite this inode.
    257 	 */
    258 	if (nb == 0 || onb - nb > log2int(onb)) {
    259 		if (debug)
    260 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
    261 		retval = COALESCE_NOTHINGLEFT;
    262 		goto out;
    263 	}
    264 
    265 	/*
    266 	 * We are going to rewrite this inode.
    267 	 * For any remaining blocks, read in their contents.
    268 	 */
    269 	for (i = 0; i < nb; i++) {
    270 		bip[i].bi_bp = malloc(bip[i].bi_size);
    271 		if (bip[i].bi_bp == NULL) {
    272 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
    273 			    bip[i].bi_size);
    274 			retval = COALESCE_NOMEM;
    275 			goto out;
    276 		}
    277 
    278 		if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
    279 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
    280 			retval = COALESCE_EIO;
    281 			goto out;
    282 		}
    283 	}
    284 	if (debug)
    285 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
    286 		    (unsigned long long)ino, nb);
    287 
    288 	/*
    289 	 * Write in segment-sized chunks.  If at any point we'd write more
    290 	 * than half of the available segments, sleep until that's not
    291 	 * true any more.
    292 	 */
    293 	bps = segtod(fs, 1);
    294 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
    295 		do {
    296 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp);
    297 			cip = *(CLEANERINFO *)bp->b_data;
    298 			brelse(bp, B_INVAL);
    299 
    300 			if (cip.clean < 4) /* XXX magic number 4 */
    301 				fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
    302 		} while(cip.clean < 4);
    303 
    304 		lim.blkiov = tbip;
    305 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
    306 		if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
    307 			retval = COALESCE_BADMARKV;
    308 			goto out;
    309 		}
    310 	}
    311 
    312 	retval = COALESCE_OK;
    313 out:
    314 	free(dip);
    315 	if (bip) {
    316 		for (i = 0; i < onb; i++)
    317 			if (bip[i].bi_bp)
    318 				free(bip[i].bi_bp);
    319 		free(bip);
    320 	}
    321 	return retval;
    322 }
    323 
    324 /*
    325  * Try coalescing every inode in the filesystem.
    326  * Return the number of inodes actually altered.
    327  */
    328 int clean_all_inodes(struct clfs *fs)
    329 {
    330 	int i, r, maxino;
    331 	int totals[COALESCE_MAXERROR];
    332 	struct stat st;
    333 
    334 	memset(totals, 0, sizeof(totals));
    335 
    336 	fstat(fs->clfs_ifilefd, &st);
    337 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
    338 		fs->lfs_segtabsz - fs->lfs_cleansz;
    339 
    340 	for (i = 0; i < maxino; i++) {
    341 		r = clean_inode(fs, i);
    342 		++totals[r];
    343 	}
    344 
    345 	for (i = 0; i < COALESCE_MAXERROR; i++)
    346 		if (totals[i])
    347 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
    348 			       totals[i]);
    349 
    350 	return totals[COALESCE_OK];
    351 }
    352 
    353 /*
    354  * Fork a child process to coalesce this fs.
    355  */
    356 int
    357 fork_coalesce(struct clfs *fs)
    358 {
    359 	static pid_t childpid;
    360 	int num;
    361 
    362 	/*
    363 	 * If already running a coalescing child, don't start a new one.
    364 	 */
    365 	if (childpid) {
    366 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
    367 			childpid = 0;
    368 	}
    369 	if (childpid && kill(childpid, 0) >= 0) {
    370 		/* already running a coalesce process */
    371 		if (debug)
    372 			syslog(LOG_DEBUG, "coalescing already in progress");
    373 		return 0;
    374 	}
    375 
    376 	/*
    377 	 * Fork a child and let the child coalease
    378 	 */
    379 	childpid = fork();
    380 	if (childpid < 0) {
    381 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
    382 		return 0;
    383 	} else if (childpid == 0) {
    384 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
    385 		       fs->lfs_fsmnt, getpid());
    386 		num = clean_all_inodes(fs);
    387 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
    388 		       fs->lfs_fsmnt, num);
    389 		exit(0);
    390 	}
    391 
    392 	return 0;
    393 }
    394