Home | History | Annotate | Line # | Download | only in lfs_cleanerd
coalesce.c revision 1.11
      1  1.11  perseant /*      $NetBSD: coalesce.c,v 1.11 2006/03/30 19:10:13 perseant Exp $  */
      2   1.1  perseant 
      3   1.1  perseant /*-
      4  1.11  perseant  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
      5   1.1  perseant  * All rights reserved.
      6   1.1  perseant  *
      7   1.1  perseant  * This code is derived from software contributed to The NetBSD Foundation
      8   1.1  perseant  * by Konrad E. Schroder <perseant (at) hhhh.org>.
      9   1.1  perseant  *
     10   1.1  perseant  * Redistribution and use in source and binary forms, with or without
     11   1.1  perseant  * modification, are permitted provided that the following conditions
     12   1.1  perseant  * are met:
     13   1.1  perseant  * 1. Redistributions of source code must retain the above copyright
     14   1.1  perseant  *    notice, this list of conditions and the following disclaimer.
     15   1.1  perseant  * 2. Redistributions in binary form must reproduce the above copyright
     16   1.1  perseant  *    notice, this list of conditions and the following disclaimer in the
     17   1.1  perseant  *    documentation and/or other materials provided with the distribution.
     18   1.1  perseant  * 3. All advertising materials mentioning features or use of this software
     19   1.1  perseant  *    must display the following acknowledgement:
     20   1.1  perseant  *      This product includes software developed by the NetBSD
     21   1.1  perseant  *      Foundation, Inc. and its contributors.
     22   1.1  perseant  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23   1.1  perseant  *    contributors may be used to endorse or promote products derived
     24   1.1  perseant  *    from this software without specific prior written permission.
     25   1.1  perseant  *
     26   1.1  perseant  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27   1.1  perseant  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28   1.1  perseant  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29   1.1  perseant  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30   1.1  perseant  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31   1.1  perseant  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32   1.1  perseant  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33   1.1  perseant  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34   1.1  perseant  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35   1.1  perseant  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36   1.1  perseant  * POSSIBILITY OF SUCH DAMAGE.
     37   1.1  perseant  */
     38   1.1  perseant 
     39   1.1  perseant #include <sys/param.h>
     40   1.1  perseant #include <sys/mount.h>
     41   1.1  perseant #include <sys/time.h>
     42   1.1  perseant #include <sys/resource.h>
     43   1.1  perseant #include <sys/types.h>
     44   1.1  perseant #include <sys/wait.h>
     45   1.1  perseant #include <sys/mman.h>
     46   1.1  perseant 
     47   1.1  perseant #include <ufs/ufs/dinode.h>
     48   1.1  perseant #include <ufs/lfs/lfs.h>
     49   1.1  perseant 
     50   1.1  perseant #include <fcntl.h>
     51   1.1  perseant #include <signal.h>
     52   1.1  perseant #include <stdio.h>
     53   1.1  perseant #include <stdlib.h>
     54   1.1  perseant #include <string.h>
     55   1.1  perseant #include <time.h>
     56   1.1  perseant #include <unistd.h>
     57   1.1  perseant #include <util.h>
     58   1.1  perseant #include <errno.h>
     59   1.1  perseant #include <err.h>
     60   1.1  perseant 
     61   1.1  perseant #include <syslog.h>
     62   1.1  perseant 
     63  1.11  perseant #include "bufcache.h"
     64  1.11  perseant #include "vnode.h"
     65  1.11  perseant #include "cleaner.h"
     66   1.1  perseant 
/*
 * Flags defined in another translation unit.  This file tests `debug'
 * before emitting its LOG_DEBUG messages; `do_mmap' is declared here but
 * not referenced by the code in this file.
 */
extern int debug, do_mmap;
     68   1.1  perseant 
/*
 * Integer base-2 logarithm, rounded down: the position of the highest
 * set bit of n.  Yields -1 when n is zero or negative.
 */
int
log2int(int n)
{
	int bits;

	/* Count how many right shifts it takes to exhaust n. */
	for (bits = 0; n > 0; n >>= 1)
		bits++;

	return bits - 1;
}
     80   1.2  perseant 
/*
 * Outcome of an attempt to coalesce a single inode.  The values are used
 * as indices into coalesce_return[] and into the per-outcome tally kept
 * by clean_all_inodes(), so they must stay dense and start at zero.
 * COALESCE_MAXERROR is a count/sentinel, not a real outcome.
 */
enum coalesce_returncodes {
	COALESCE_OK = 0,
	COALESCE_NOINODE,
	COALESCE_TOOSMALL,
	COALESCE_BADSIZE,
	COALESCE_BADBLOCKSIZE,
	COALESCE_NOMEM,
	COALESCE_BADBMAPV,
	COALESCE_BADMARKV,
	COALESCE_NOTWORTHIT,
	COALESCE_NOTHINGLEFT,
	COALESCE_EIO,

	COALESCE_MAXERROR
};

/*
 * Human-readable description of each return code, indexed by the code.
 * Designated initializers tie every string to its enumerator so that
 * adding or reordering a code cannot silently shift the messages.
 */
char *coalesce_return[] = {
	[COALESCE_OK]		= "Successfully coalesced",
	[COALESCE_NOINODE]	= "File not in use or inode not found",
	[COALESCE_TOOSMALL]	= "Not large enough to coalesce",
	[COALESCE_BADSIZE]	= "Negative size",
	[COALESCE_BADBLOCKSIZE]	= "Not enough blocks to account for size",
	[COALESCE_NOMEM]	= "Malloc failed",
	[COALESCE_BADBMAPV]	= "LFCNBMAPV failed",
	[COALESCE_BADMARKV]	= "LFCNMARKV failed",
	[COALESCE_NOTWORTHIT]	= "Not broken enough to fix",
	[COALESCE_NOTHINGLEFT]	= "Too many blocks not found or in active segments",
	[COALESCE_EIO]		= "I/O error",

	[COALESCE_MAXERROR]	= "No such error"
};
    112   1.3  perseant 
    113  1.11  perseant static struct ufs1_dinode *
    114  1.11  perseant get_dinode(struct clfs *fs, ino_t ino)
    115  1.11  perseant {
    116  1.11  perseant 	IFILE *ifp;
    117  1.11  perseant 	daddr_t daddr;
    118  1.11  perseant 	struct ubuf *bp;
    119  1.11  perseant 	struct ufs1_dinode *dip, *r;
    120  1.11  perseant 
    121  1.11  perseant 	lfs_ientry(&ifp, fs, ino, &bp);
    122  1.11  perseant 	daddr = ifp->if_daddr;
    123  1.11  perseant 	brelse(bp);
    124  1.11  perseant 
    125  1.11  perseant 	if (daddr == 0x0)
    126  1.11  perseant 		return NULL;
    127  1.11  perseant 
    128  1.11  perseant 	bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, &bp);
    129  1.11  perseant 	for (dip = (struct ufs1_dinode *)bp->b_data;
    130  1.11  perseant 	     dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++)
    131  1.11  perseant 		if (dip->di_inumber == ino) {
    132  1.11  perseant 			r = (struct ufs1_dinode *)malloc(sizeof(*r));
    133  1.11  perseant 			memcpy(r, dip, sizeof(*r));
    134  1.11  perseant 			brelse(bp);
    135  1.11  perseant 			return r;
    136  1.11  perseant 		}
    137  1.11  perseant 	brelse(bp);
    138  1.11  perseant 	return NULL;
    139  1.11  perseant }
    140  1.11  perseant 
    141   1.1  perseant /*
    142   1.1  perseant  * Find out if this inode's data blocks are discontinuous; if they are,
    143   1.7  perseant  * rewrite them using markv.  Return the number of inodes rewritten.
    144   1.1  perseant  */
    145  1.11  perseant static int
    146  1.11  perseant clean_inode(struct clfs *fs, ino_t ino)
    147   1.1  perseant {
    148   1.7  perseant 	BLOCK_INFO *bip = NULL, *tbip;
    149  1.11  perseant 	CLEANERINFO cip;
    150  1.11  perseant 	struct ubuf *bp;
    151   1.9      fvdl 	struct ufs1_dinode *dip;
    152  1.11  perseant 	struct clfs_seguse *sup;
    153  1.11  perseant 	struct lfs_fcntl_markv /* {
    154  1.11  perseant 		BLOCK_INFO *blkiov;
    155  1.11  perseant 		int blkcnt;
    156  1.11  perseant 	} */ lim;
    157  1.11  perseant 	daddr_t toff;
    158  1.11  perseant 	int i;
    159   1.2  perseant 	int nb, onb, noff;
    160  1.11  perseant 	int retval;
    161   1.1  perseant 	int bps;
    162   1.1  perseant 
    163  1.11  perseant 	dip = get_dinode(fs, ino);
    164   1.1  perseant 	if (dip == NULL)
    165   1.3  perseant 		return COALESCE_NOINODE;
    166   1.1  perseant 
    167   1.7  perseant 	/* Compute file block size, set up for bmapv */
    168  1.11  perseant 	onb = nb = lblkno(fs, dip->di_size);
    169   1.2  perseant 
    170   1.2  perseant 	/* XXX for now, don't do any file small enough to have fragments */
    171   1.2  perseant 	if (nb < NDADDR)
    172   1.3  perseant 		return COALESCE_TOOSMALL;
    173   1.2  perseant 
    174   1.2  perseant 	/* Sanity checks */
    175   1.2  perseant 	if (dip->di_size < 0) {
    176  1.11  perseant 		dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size);
    177   1.3  perseant 		return COALESCE_BADSIZE;
    178   1.2  perseant 	}
    179   1.1  perseant 	if (nb > dip->di_blocks) {
    180  1.11  perseant 		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
    181  1.11  perseant 		     dip->di_blocks);
    182   1.3  perseant 		return COALESCE_BADBLOCKSIZE;
    183   1.1  perseant 	}
    184   1.2  perseant 
    185   1.7  perseant 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
    186   1.1  perseant 	if (bip == NULL) {
    187  1.10  christos 		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
    188  1.10  christos 		    (unsigned long long)ino, nb);
    189   1.3  perseant 		return COALESCE_NOMEM;
    190   1.1  perseant 	}
    191   1.1  perseant 	for (i = 0; i < nb; i++) {
    192   1.7  perseant 		memset(bip + i, 0, sizeof(BLOCK_INFO));
    193   1.1  perseant 		bip[i].bi_inode = ino;
    194   1.1  perseant 		bip[i].bi_lbn = i;
    195   1.2  perseant 		bip[i].bi_version = dip->di_gen;
    196   1.1  perseant 		/* Don't set the size, but let lfs_bmap fill it in */
    197   1.1  perseant 	}
    198  1.11  perseant 	lim.blkiov = bip;
    199  1.11  perseant 	lim.blkcnt = nb;
    200  1.11  perseant 	if (fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) {
    201  1.11  perseant 		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m",
    202  1.11  perseant 		       fs->lfs_fsmnt);
    203   1.5      yamt 		retval = COALESCE_BADBMAPV;
    204   1.5      yamt 		goto out;
    205   1.5      yamt 	}
    206   1.5      yamt #if 0
    207   1.5      yamt 	for (i = 0; i < nb; i++) {
    208   1.5      yamt 		printf("bi_size = %d, bi_ino = %d, "
    209   1.5      yamt 		    "bi_lbn = %d, bi_daddr = %d\n",
    210   1.5      yamt 		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
    211   1.5      yamt 		    bip[i].bi_daddr);
    212   1.1  perseant 	}
    213   1.5      yamt #endif
    214   1.1  perseant 	noff = toff = 0;
    215   1.1  perseant 	for (i = 1; i < nb; i++) {
    216  1.11  perseant 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag)
    217   1.1  perseant 			++noff;
    218   1.4      yamt 		toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr
    219  1.11  perseant 		    - fs->lfs_frag) >> fs->lfs_fbshift;
    220   1.1  perseant 	}
    221   1.1  perseant 
    222   1.1  perseant 	/*
    223   1.1  perseant 	 * If this file is not discontinuous, there's no point in rewriting it.
    224  1.11  perseant 	 *
    225  1.11  perseant 	 * Explicitly allow a certain amount of discontinuity, since large
    226  1.11  perseant 	 * files will be broken among segments and medium-sized files
    227  1.11  perseant 	 * can have a break or two and it's okay.
    228   1.1  perseant 	 */
    229   1.2  perseant 	if (nb <= 1 || noff == 0 || noff < log2int(nb) ||
    230  1.11  perseant 	    segtod(fs, noff) * 2 < nb) {
    231   1.5      yamt 		retval = COALESCE_NOTWORTHIT;
    232   1.5      yamt 		goto out;
    233   1.1  perseant 	} else if (debug)
    234  1.10  christos 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
    235  1.10  christos 		    "%d (%lld) for %d blocks", (unsigned long long)ino,
    236  1.10  christos 		    noff, (long long)toff, nb);
    237   1.1  perseant 
    238   1.1  perseant 	/* Search for blocks in active segments; don't move them. */
    239   1.1  perseant 	for (i = 0; i < nb; i++) {
    240   1.1  perseant 		if (bip[i].bi_daddr <= 0)
    241   1.1  perseant 			continue;
    242  1.11  perseant 		sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)];
    243  1.11  perseant 		if (sup->flags & SEGUSE_ACTIVE)
    244   1.1  perseant 			bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */
    245   1.1  perseant 	}
    246  1.11  perseant 
    247  1.11  perseant 	/*
    248  1.11  perseant 	 * Get rid of any blocks we've marked dead.  If this is an older
    249  1.11  perseant 	 * kernel that doesn't have bmapv fill in the block sizes, we'll
    250  1.11  perseant 	 * toss everything here.
    251   1.1  perseant 	 */
    252  1.11  perseant 	onb = nb;
    253  1.11  perseant 	toss_old_blocks(fs, &bip, &nb);
    254  1.11  perseant 	nb = i;
    255   1.2  perseant 
    256   1.1  perseant 	/*
    257   1.2  perseant 	 * We may have tossed enough blocks that it is no longer worthwhile
    258   1.2  perseant 	 * to rewrite this inode.
    259   1.1  perseant 	 */
    260  1.11  perseant 	if (nb == 0 || onb - nb > log2int(onb)) {
    261   1.3  perseant 		if (debug)
    262   1.3  perseant 			syslog(LOG_DEBUG, "too many blocks tossed, not rewriting");
    263  1.11  perseant 		retval = COALESCE_NOTHINGLEFT;
    264  1.11  perseant 		goto out;
    265   1.1  perseant 	}
    266   1.1  perseant 
    267  1.11  perseant 	/*
    268   1.1  perseant 	 * We are going to rewrite this inode.
    269   1.1  perseant 	 * For any remaining blocks, read in their contents.
    270   1.1  perseant 	 */
    271   1.1  perseant 	for (i = 0; i < nb; i++) {
    272   1.1  perseant 		bip[i].bi_bp = malloc(bip[i].bi_size);
    273   1.5      yamt 		if (bip[i].bi_bp == NULL) {
    274   1.5      yamt 			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
    275   1.5      yamt 			    bip[i].bi_size);
    276   1.5      yamt 			retval = COALESCE_NOMEM;
    277   1.5      yamt 			goto out;
    278   1.5      yamt 		}
    279  1.11  perseant 
    280  1.11  perseant 		if (pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size,
    281  1.11  perseant 			  fsbtob(fs, bip[i].bi_daddr)) < 0) {
    282   1.5      yamt 			retval = COALESCE_EIO;
    283   1.5      yamt 			goto out;
    284   1.5      yamt 		}
    285   1.1  perseant 	}
    286   1.1  perseant 	if (debug)
    287  1.10  christos 		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
    288  1.10  christos 		    (unsigned long long)ino, nb);
    289   1.1  perseant 
    290   1.2  perseant 	/*
    291   1.2  perseant 	 * Write in segment-sized chunks.  If at any point we'd write more
    292   1.2  perseant 	 * than half of the available segments, sleep until that's not
    293   1.2  perseant 	 * true any more.
    294   1.2  perseant 	 */
    295  1.11  perseant 	bps = segtod(fs, 1);
    296   1.1  perseant 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
    297  1.11  perseant 		do {
    298  1.11  perseant 			bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, &bp);
    299  1.11  perseant 			cip = *(CLEANERINFO *)bp->b_data;
    300  1.11  perseant 			bp->b_flags |= B_INVAL;
    301  1.11  perseant 			brelse(bp);
    302  1.11  perseant 
    303  1.11  perseant 			if (cip.clean < 4) /* XXX magic number 4 */
    304  1.11  perseant 				fcntl(fs->clfs_ifilefd, LFCNSEGWAIT, NULL);
    305  1.11  perseant 		} while(cip.clean < 4);
    306  1.11  perseant 
    307  1.11  perseant 		lim.blkiov = tbip;
    308  1.11  perseant 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
    309  1.11  perseant 		if (fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {
    310  1.11  perseant 			retval = COALESCE_BADMARKV;
    311  1.11  perseant 			goto out;
    312   1.2  perseant 		}
    313   1.1  perseant 	}
    314   1.1  perseant 
    315   1.5      yamt 	retval = COALESCE_OK;
    316   1.5      yamt out:
    317  1.11  perseant 	free(dip);
    318   1.5      yamt 	if (bip) {
    319   1.5      yamt 		for (i = 0; i < onb; i++)
    320   1.5      yamt 			if (bip[i].bi_bp)
    321   1.5      yamt 				free(bip[i].bi_bp);
    322   1.5      yamt 		free(bip);
    323   1.5      yamt 	}
    324   1.5      yamt 	return retval;
    325   1.1  perseant }
    326   1.1  perseant 
    327   1.1  perseant /*
    328   1.1  perseant  * Try coalescing every inode in the filesystem.
    329   1.1  perseant  * Return the number of inodes actually altered.
    330   1.1  perseant  */
    331  1.11  perseant int clean_all_inodes(struct clfs *fs)
    332   1.1  perseant {
    333  1.11  perseant 	int i, r, maxino;
    334   1.3  perseant 	int totals[COALESCE_MAXERROR];
    335  1.11  perseant 	struct stat st;
    336   1.1  perseant 
    337   1.3  perseant 	memset(totals, 0, sizeof(totals));
    338  1.11  perseant 
    339  1.11  perseant 	fstat(fs->clfs_ifilefd, &st);
    340  1.11  perseant 	maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) -
    341  1.11  perseant 		fs->lfs_segtabsz - fs->lfs_cleansz;
    342  1.11  perseant 
    343  1.11  perseant 	for (i = 0; i < maxino; i++) {
    344  1.11  perseant 		r = clean_inode(fs, i);
    345   1.3  perseant 		++totals[r];
    346   1.1  perseant 	}
    347   1.3  perseant 
    348   1.3  perseant 	for (i = 0; i < COALESCE_MAXERROR; i++)
    349   1.3  perseant 		if (totals[i])
    350   1.3  perseant 			syslog(LOG_DEBUG, "%s: %d", coalesce_return[i],
    351  1.11  perseant 			       totals[i]);
    352  1.11  perseant 
    353   1.3  perseant 	return totals[COALESCE_OK];
    354   1.1  perseant }
    355   1.1  perseant 
    356  1.11  perseant /*
    357  1.11  perseant  * Fork a child process to coalesce this fs.
    358  1.11  perseant  */
    359  1.11  perseant int
    360  1.11  perseant fork_coalesce(struct clfs *fs)
    361   1.1  perseant {
    362   1.1  perseant 	static pid_t childpid;
    363   1.2  perseant 	int num;
    364   1.2  perseant 
    365  1.11  perseant 	/*
    366  1.11  perseant 	 * If already running a coalescing child, don't start a new one.
    367  1.11  perseant 	 */
    368   1.1  perseant 	if (childpid) {
    369  1.11  perseant 		if (waitpid(childpid, NULL, WNOHANG) == childpid)
    370   1.1  perseant 			childpid = 0;
    371   1.1  perseant 	}
    372   1.1  perseant 	if (childpid && kill(childpid, 0) >= 0) {
    373   1.1  perseant 		/* already running a coalesce process */
    374   1.2  perseant 		if (debug)
    375   1.2  perseant 			syslog(LOG_DEBUG, "coalescing already in progress");
    376   1.1  perseant 		return 0;
    377   1.1  perseant 	}
    378  1.11  perseant 
    379  1.11  perseant 	/*
    380  1.11  perseant 	 * Fork a child and let the child coalease
    381  1.11  perseant 	 */
    382   1.1  perseant 	childpid = fork();
    383   1.1  perseant 	if (childpid < 0) {
    384  1.11  perseant 		syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt);
    385   1.1  perseant 		return 0;
    386   1.1  perseant 	} else if (childpid == 0) {
    387  1.11  perseant 		syslog(LOG_NOTICE, "%s: new coalescing process, pid %d",
    388  1.11  perseant 		       fs->lfs_fsmnt, getpid());
    389  1.11  perseant 		num = clean_all_inodes(fs);
    390  1.11  perseant 		syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes",
    391  1.11  perseant 		       fs->lfs_fsmnt, num);
    392   1.1  perseant 		exit(0);
    393   1.1  perseant 	}
    394  1.11  perseant 
    395   1.1  perseant 	return 0;
    396   1.1  perseant }
    397