Home | History | Annotate | Line # | Download | only in lfs_cleanerd
coalesce.c revision 1.27
      1 /*      $NetBSD: coalesce.c,v 1.27 2015/07/28 05:09:34 dholland Exp $  */
      2 
      3 /*-
      4  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/param.h>
     33 #include <sys/mount.h>
     34 #include <sys/time.h>
     35 #include <sys/resource.h>
     36 #include <sys/types.h>
     37 #include <sys/wait.h>
     38 #include <sys/mman.h>
     39 
     40 #include <ufs/lfs/lfs.h>
     41 #include <ufs/lfs/lfs_accessors.h>
     42 
     43 #include <fcntl.h>
     44 #include <signal.h>
     45 #include <stdio.h>
     46 #include <stdlib.h>
     47 #include <string.h>
     48 #include <time.h>
     49 #include <unistd.h>
     50 #include <util.h>
     51 #include <errno.h>
     52 #include <err.h>
     53 #include <assert.h>
     54 
     55 #include <syslog.h>
     56 
     57 #include "bufcache.h"
     58 #include "vnode.h"
     59 #include "cleaner.h"
     60 #include "kernelops.h"
     61 
     62 extern int debug, do_mmap;
     63 
/*
 * Integer base-2 logarithm, rounded down: the index of the highest set
 * bit of n.  Yields -1 when n is zero or negative.
 */
int log2int(int n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
     75 
/*
 * Result codes returned by clean_inode().  Each code is used as an index
 * into coalesce_return[] and into the per-code tally kept by
 * clean_all_inodes(), so COALESCE_MAXERROR must remain the last entry.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each result code.
 *
 * The entries are keyed by enumerator rather than by position so the
 * table cannot drift out of step with the enum above (the positional
 * version had no LFCNMARKV entry, which shifted every later message
 * onto the wrong code).
 */
const char *coalesce_return[COALESCE_MAXERROR + 1] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks found in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
    107 
    108 static struct ulfs1_dinode *
    109 get_dinode(struct clfs *fs, ino_t ino)
    110 {
    111 	IFILE *ifp;
    112 	daddr_t daddr;
    113 	struct ubuf *bp;
    114 	struct ulfs1_dinode *dip, *r;
    115 
    116 	lfs_ientry(&ifp, fs, ino, &bp);
    117 	daddr = ifp->if_daddr;
    118 	brelse(bp, 0);
    119 
    120 	if (daddr == 0x0)
    121 		return NULL;
    122 
    123 	bread(fs->clfs_devvp, daddr, lfs_sb_getibsize(fs), 0, &bp);
    124 	for (dip = (struct ulfs1_dinode *)bp->b_data;
    125 	     dip < (struct ulfs1_dinode *)(bp->b_data + lfs_sb_getibsize(fs)); dip++)
    126 		if (dip->di_inumber == ino) {
    127 			r = (struct ulfs1_dinode *)malloc(sizeof(*r));
    128 			if (r == NULL)
    129 				break;
    130 			memcpy(r, dip, sizeof(*r));
    131 			brelse(bp, 0);
    132 			return r;
    133 		}
    134 	brelse(bp, 0);
    135 	return NULL;
    136 }
    137 
    138 /*
    139  * Find out if this inode's data blocks are discontinuous; if they are,
    140  * rewrite them using markv.  Return the number of inodes rewritten.
    141  */
    142 static int
    143 clean_inode(struct clfs *fs, ino_t ino)
    144 {
    145 	BLOCK_INFO *bip = NULL, *tbip;
    146 	CLEANERINFO cip;
    147 	struct ubuf *bp;
    148 	struct ulfs1_dinode *dip;
    149 	struct clfs_seguse *sup;
    150 	struct lfs_fcntl_markv /* {
    151 		BLOCK_INFO *blkiov;
    152 		int blkcnt;
    153 	} */ lim;
    154 	daddr_t toff;
    155 	int i;
    156 	int nb, onb, noff;
    157 	int retval;
    158 	int bps;
    159 
    160 	dip = get_dinode(fs, ino);
    161 	if (dip == NULL)
    162 		return COALESCE_NOINODE;
    163 
    164 	/* Compute file block size, set up for bmapv */
    165 	onb = nb = lfs_lblkno(fs, dip->di_size);
    166 
    167 	/* XXX for now, don't do any file small enough to have fragments */
    168 	if (nb < ULFS_NDADDR) {
    169 		free(dip);
    170 		return COALESCE_TOOSMALL;
    171 	}
    172 
    173 	/* Sanity checks */
    174 #if 0	/* di_size is uint64_t -- this is a noop */
    175 	if (dip->di_size < 0) {
    176 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
    177 		free(dip);
    178 		return COALESCE_BADSIZE;
    179 	}
    180 #endif
    181 	if (nb > dip->di_blocks) {
    182 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
    183 		     dip->di_blocks);
    184 		free(dip);
    185 		return COALESCE_BADBLOCKSIZE;
    186 	}
    187 
    188 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
    189 	if (bip == NULL) {
    190 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
    191 		    (unsigned long long)ino, nb);
    192 		free(dip);
    193 		return COALESCE_NOMEM;
    194 	}
    195 	for (i = 0; i < nb; i++) {
    196 		memset(bip + i, 0, sizeof(BLOCK_INFO));
    197 		bip[i].bi_inode = ino;
    198 		bip[i].bi_lbn = i;
    199 		bip[i].bi_version = dip->di_gen;
    200 		/* Don't set the size, but let lfs_bmap fill it in */
    201 	}
    202 	lim.blkiov = bip;
    203 	lim.blkcnt = nb;
    204 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
    205 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
    206 		       lfs_sb_getfsmnt(fs));
    207 		retval = COALESCE_BADBMAPV;
    208 		goto out;
    209 	}
    210 #if 0
    211 	for (i = 0; i < nb; i++) {
    212 		printf("bi_size = %d, bi_ino = %d, "
    213 		    "bi_lbn = %d, bi_daddr = %d\n",
    214 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
    215 		    bip[i].bi_daddr);
    216 	}
    217 #endif
    218 	noff = toff = 0;
    219 	for (i = 1; i < nb; i++) {
    220 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + clfs_sb_getfrag(fs))
    221 			++noff;
    222 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
    223 		    - clfs_sb_getfrag(fs)) >> lfs_sb_getfbshift(fs);
    224 	}
    225 
    226 	/*
    227 	 * If this file is not discontinuous, there's no point in rewriting it.
    228 	 *
    229 	 * Explicitly allow a certain amount of discontinuity, since large
    230 	 * files will be broken among segments and medium-sized files
    231 	 * can have a break or two and it's okay.
    232 	 */
    233 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
    234 	    lfs_segtod(fs, noff) * 2 < nb) {
    235 		retval = COALESCE_NOTWORTHIT;
    236 		goto out;
    237 	} else if (debug)
    238 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
    239 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
    240 		    noff, (long long)toff, nb);
    241 
    242 	/* Search for blocks in active segments; don't move them. */
    243 	for (i = 0; i < nb; i++) {
    244 		if (bip[i].bi_daddr <= 0)
    245 			continue;
    246 		sup = &fs->clfs_segtab[lfs_dtosn(fs, bip[i].bi_daddr)];
    247 		if (sup->flags & SEGUSE_ACTIVE)
    248 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
    249 	}
    250 
    251 	/*
    252 	 * Get rid of any blocks we've marked dead.  If this is an older
    253 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
    254 	 * toss everything here.
    255 	 */
    256 	onb = nb;
    257 	toss_old_blocks(fs, &bip, &nb, NULL);
    258 	nb = i;
    259 
    260 	/*
    261 	 * We may have tossed enough blocks that it is no longer worthwhile
    262 	 * to rewrite this inode.
    263 	 */
    264 	if (nb == 0 || onb - nb > log2int(onb)) {
    265 		if (debug)
    266 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
    267 		retval = COALESCE_NOTHINGLEFT;
    268 		goto out;
    269 	}
    270 
    271 	/*
    272 	 * We are going to rewrite this inode.
    273 	 * For any remaining blocks, read in their contents.
    274 	 */
    275 	for (i = 0; i < nb; i++) {
    276 		bip[i].bi_bp = malloc(bip[i].bi_size);
    277 		if (bip[i].bi_bp == NULL) {
    278 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
    279 			    bip[i].bi_size);
    280 			retval = COALESCE_NOMEM;
    281 			goto out;
    282 		}
    283 
    284 		if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
    285 			  lfs_fsbtob(fs, bip[i].bi_daddr)) < 0) {
    286 			retval = COALESCE_EIO;
    287 			goto out;
    288 		}
    289 	}
    290 	if (debug)
    291 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
    292 		    (unsigned long long)ino, nb);
    293 
    294 	/*
    295 	 * Write in segment-sized chunks.  If at any point we'd write more
    296 	 * than half of the available segments, sleep until that's not
    297 	 * true any more.
    298 	 */
    299 	bps = lfs_segtod(fs, 1);
    300 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
    301 		do {
    302 			bread(fs->lfs_ivnode, 0, clfs_sb_getbsize(fs), 0, &bp);
    303 			cip = *(CLEANERINFO *)bp->b_data;
    304 			brelse(bp, B_INVAL);
    305 
    306 			if (cip.clean < 4) /* XXX magic number 4 */
    307 				kops.ko_fcntl(fs->clfs_ifilefd,
    308 				    LFCNSEGWAIT, NULL);
    309 		} while(cip.clean < 4);
    310 
    311 		lim.blkiov = tbip;
    312 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
    313 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
    314 			retval = COALESCE_BADMARKV;
    315 			goto out;
    316 		}
    317 	}
    318 
    319 	retval = COALESCE_OK;
    320 out:
    321 	free(dip);
    322 	if (bip) {
    323 		for (i = 0; i < onb; i++)
    324 			if (bip[i].bi_bp)
    325 				free(bip[i].bi_bp);
    326 		free(bip);
    327 	}
    328 	return retval;
    329 }
    330 
    331 /*
    332  * Try coalescing every inode in the filesystem.
    333  * Return the number of inodes actually altered.
    334  */
    335 int clean_all_inodes(struct clfs *fs)
    336 {
    337 	int i, r, maxino;
    338 	int totals[COALESCE_MAXERROR];
    339 	struct stat st;
    340 
    341 	memset(totals, 0, sizeof(totals));
    342 
    343 	fstat(fs->clfs_ifilefd, &st);
    344 	maxino = lfs_sb_getifpb(fs) * (st.st_size >> lfs_sb_getbshift(fs)) -
    345 		lfs_sb_getsegtabsz(fs) - lfs_sb_getcleansz(fs);
    346 
    347 	for (i = 0; i < maxino; i++) {
    348 		r = clean_inode(fs, i);
    349 		++totals[r];
    350 	}
    351 
    352 	for (i = 0; i < COALESCE_MAXERROR; i++)
    353 		if (totals[i])
    354 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
    355 			       totals[i]);
    356 
    357 	return totals[COALESCE_OK];
    358 }
    359 
    360 /*
    361  * Fork a child process to coalesce this fs.
    362  */
    363 int
    364 fork_coalesce(struct clfs *fs)
    365 {
    366 	static pid_t childpid;
    367 	int num;
    368 
    369 	/*
    370 	 * If already running a coalescing child, don't start a new one.
    371 	 */
    372 	if (childpid) {
    373 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
    374 			childpid = 0;
    375 	}
    376 	if (childpid && kill(childpid, 0) >= 0) {
    377 		/* already running a coalesce process */
    378 		if (debug)
    379 			syslog(LOG_DEBUG, "coalescing already in progress");
    380 		return 0;
    381 	}
    382 
    383 	/*
    384 	 * Fork a child and let the child coalease
    385 	 */
    386 	childpid = fork();
    387 	if (childpid < 0) {
    388 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", lfs_sb_getfsmnt(fs));
    389 		return 0;
    390 	} else if (childpid == 0) {
    391 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
    392 		       lfs_sb_getfsmnt(fs), getpid());
    393 		num = clean_all_inodes(fs);
    394 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
    395 		       lfs_sb_getfsmnt(fs), num);
    396 		exit(0);
    397 	}
    398 
    399 	return 0;
    400 }
    401