Home | History | Annotate | Line # | Download | only in rumpvfs
rumpblk.c revision 1.14
      1 /*	$NetBSD: rumpblk.c,v 1.14 2009/03/23 11:52:42 pooka Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Development of this software was supported by the
      7  * Finnish Cultural Foundation.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     19  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     21  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     24  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     28  * SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * Block device emulation.  Presents a block device interface and
     33  * uses rumpuser system calls to satisfy I/O requests.
     34  */
     35 
     36 #include <sys/cdefs.h>
     37 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.14 2009/03/23 11:52:42 pooka Exp $");
     38 
     39 #include <sys/param.h>
     40 #include <sys/buf.h>
     41 #include <sys/conf.h>
     42 #include <sys/disklabel.h>
     43 #include <sys/fcntl.h>
     44 #include <sys/kmem.h>
     45 #include <sys/malloc.h>
     46 #include <sys/stat.h>
     47 
     48 #include <rump/rumpuser.h>
     49 
     50 #include "rump_private.h"
     51 #include "rump_vfs_private.h"
     52 
     53 #define RUMPBLK_SIZE 16
     54 static struct rblkdev {
     55 	char *rblk_path;
     56 	int rblk_fd;
     57 	uint8_t *rblk_mem;
     58 	off_t rblk_size;
     59 
     60 	struct partition *rblk_curpi;
     61 	struct partition rblk_pi;
     62 	struct disklabel rblk_dl;
     63 } minors[RUMPBLK_SIZE];
     64 
     65 dev_type_open(rumpblk_open);
     66 dev_type_close(rumpblk_close);
     67 dev_type_read(rumpblk_read);
     68 dev_type_write(rumpblk_write);
     69 dev_type_ioctl(rumpblk_ioctl);
     70 dev_type_strategy(rumpblk_strategy);
     71 dev_type_strategy(rumpblk_strategy_fail);
     72 dev_type_dump(rumpblk_dump);
     73 dev_type_size(rumpblk_size);
     74 
     75 static const struct bdevsw rumpblk_bdevsw = {
     76 	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
     77 	nodump, nosize, D_DISK
     78 };
     79 
     80 static const struct bdevsw rumpblk_bdevsw_fail = {
     81 	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
     82 	nodump, nosize, D_DISK
     83 };
     84 
     85 static const struct cdevsw rumpblk_cdevsw = {
     86 	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
     87 	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
     88 };
     89 
     90 /* fail every n out of BLKFAIL_MAX */
     91 #define BLKFAIL_MAX 10000
     92 static int blkfail;
     93 static unsigned randstate;
     94 
     95 int
     96 rumpblk_init(void)
     97 {
     98 	char buf[64];
     99 	int rumpblk = RUMPBLK;
    100 	int error;
    101 
    102 	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
    103 		blkfail = strtoul(buf, NULL, 10);
    104 		/* fail everything */
    105 		if (blkfail > BLKFAIL_MAX)
    106 			blkfail = BLKFAIL_MAX;
    107 		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
    108 		    &error) == 0) {
    109 			randstate = strtoul(buf, NULL, 10);
    110 		} else {
    111 			randstate = arc4random(); /* XXX: not enough entropy */
    112 		}
    113 		printf("rumpblk: FAULT INJECTION ACTIVE!  every %d out of"
    114 		    " %d I/O will fail.  key %u\n", blkfail, BLKFAIL_MAX,
    115 		    randstate);
    116 	} else {
    117 		blkfail = 0;
    118 	}
    119 
    120 	if (blkfail) {
    121 		return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
    122 		    &rumpblk_cdevsw, &rumpblk);
    123 	} else {
    124 		return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
    125 		    &rumpblk_cdevsw, &rumpblk);
    126 	}
    127 }
    128 
    129 int
    130 rumpblk_register(const char *path)
    131 {
    132 	size_t len;
    133 	int i;
    134 
    135 	for (i = 0; i < RUMPBLK_SIZE; i++)
    136 		if (minors[i].rblk_path && strcmp(minors[i].rblk_path, path) == 0)
    137 			return i;
    138 
    139 	for (i = 0; i < RUMPBLK_SIZE; i++)
    140 		if (minors[i].rblk_path == NULL)
    141 			break;
    142 	if (i == RUMPBLK_SIZE)
    143 		return -1;
    144 
    145 	len = strlen(path);
    146 	minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
    147 	strcpy(minors[i].rblk_path, path);
    148 	minors[i].rblk_fd = -1;
    149 	return i;
    150 }
    151 
    152 int
    153 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
    154 {
    155 	struct rblkdev *rblk = &minors[minor(dev)];
    156 	uint8_t *mem = NULL;
    157 	uint64_t fsize;
    158 	int ft, dummy;
    159 	int error, fd;
    160 
    161 	KASSERT(rblk->rblk_fd == -1);
    162 	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
    163 	if (error)
    164 		return error;
    165 
    166 	if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
    167 		rumpuser_close(fd, &dummy);
    168 		return error;
    169 	}
    170 
    171 	if (ft == RUMPUSER_FT_REG) {
    172 		/*
    173 		 * Try to mmap the file if it's size is max. half of
    174 		 * the address space.  If mmap fails due to e.g. limits,
    175 		 * we fall back to the read/write path.  This test is only
    176 		 * to prevent size_t vs. off_t wraparounds.
    177 		 */
    178 		if (fsize < UINT64_C(1) << (sizeof(void *) * 8 - 1)) {
    179 			int mmflags;
    180 
    181 			mmflags = 0;
    182 			if (flag & FREAD)
    183 				mmflags |= RUMPUSER_FILEMMAP_READ;
    184 			if (flag & FWRITE) {
    185 				mmflags |= RUMPUSER_FILEMMAP_WRITE;
    186 				mmflags |= RUMPUSER_FILEMMAP_SHARED;
    187 			}
    188 			mem = rumpuser_filemmap(fd, 0, fsize, mmflags, &error);
    189 		}
    190 
    191 		memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
    192 
    193 		rblk->rblk_size = fsize;
    194 		rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
    195 		rblk->rblk_dl.d_secsize = DEV_BSIZE;
    196 		rblk->rblk_curpi = &rblk->rblk_pi;
    197 	} else {
    198 		if (rumpuser_ioctl(fd, DIOCGDINFO, &rblk->rblk_dl,
    199 		    &error) != -1) {
    200 			rumpuser_close(fd, &dummy);
    201 			return error;
    202 		}
    203 
    204 		rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
    205 	}
    206 	rblk->rblk_fd = fd;
    207 	rblk->rblk_mem = mem;
    208 	if (rblk->rblk_mem != NULL)
    209 		printf("rumpblk%d: using mmio for %s\n",
    210 		    minor(dev), rblk->rblk_path);
    211 
    212 	return 0;
    213 }
    214 
    215 int
    216 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
    217 {
    218 	struct rblkdev *rblk = &minors[minor(dev)];
    219 	int dummy;
    220 
    221 	if (rblk->rblk_mem) {
    222 		KASSERT(rblk->rblk_size);
    223 		rumpuser_memsync(rblk->rblk_mem, rblk->rblk_size, &dummy);
    224 		rumpuser_unmap(rblk->rblk_mem, rblk->rblk_size);
    225 		rblk->rblk_mem = NULL;
    226 	}
    227 	rumpuser_close(rblk->rblk_fd, &dummy);
    228 	rblk->rblk_fd = -1;
    229 
    230 	return 0;
    231 }
    232 
    233 int
    234 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
    235 {
    236 	struct rblkdev *rblk = &minors[minor(dev)];
    237 	int rv, error;
    238 
    239 	if (xfer == DIOCGPART) {
    240 		struct partinfo *pi = (struct partinfo *)addr;
    241 
    242 		pi->part = rblk->rblk_curpi;
    243 		pi->disklab = &rblk->rblk_dl;
    244 
    245 		return 0;
    246 	}
    247 
    248 	rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
    249 	if (rv == -1)
    250 		return error;
    251 
    252 	return 0;
    253 }
    254 
/*
 * Character device read entry point.  Not implemented: calling it
 * panics.
 */
int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	panic("%s: unimplemented", __func__);
}
    261 
/*
 * Character device write entry point.  Not implemented: calling it
 * panics.
 */
int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	panic("%s: unimplemented", __func__);
}
    268 
    269 static void
    270 dostrategy(struct buf *bp)
    271 {
    272 	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
    273 	off_t off;
    274 	int async, error;
    275 
    276 	off = bp->b_blkno << DEV_BSHIFT;
    277 	/*
    278 	 * Do bounds checking if we're working on a file.  Otherwise
    279 	 * invalid file systems might attempt to read beyond EOF.  This
    280 	 * is bad(tm) especially on mmapped images.  This is essentially
    281 	 * the kernel bounds_check() routines.
    282 	 */
    283 	if (rblk->rblk_size && off + bp->b_bcount > rblk->rblk_size) {
    284 		int64_t sz = rblk->rblk_size - off;
    285 
    286 		/* EOF */
    287 		if (sz == 0) {
    288 			rump_biodone(bp, 0, 0);
    289 			return;
    290 		}
    291 		/* beyond EOF ==> error */
    292 		if (sz < 0) {
    293 			rump_biodone(bp, 0, EINVAL);
    294 			return;
    295 		}
    296 
    297 		/* truncate to device size */
    298 		bp->b_bcount = sz;
    299 	}
    300 
    301 	async = bp->b_flags & B_ASYNC;
    302 	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
    303 	    " (0x%" PRIx64 " - 0x%" PRIx64")\n",
    304 	    bp->b_bcount, BUF_ISREAD(bp) "READ" : "WRITE",
    305 	    off, off, (off + bp->b_bcount)));
    306 
    307 	/* mem optimization?  handle here and return */
    308 	if (rblk->rblk_mem) {
    309 		uint8_t *ioaddr = rblk->rblk_mem + off;
    310 
    311 		if (BUF_ISREAD(bp)) {
    312 			memcpy(bp->b_data, ioaddr, bp->b_bcount);
    313 		} else {
    314 			memcpy(ioaddr, bp->b_data, bp->b_bcount);
    315 		}
    316 
    317 		/* synchronous write, sync necessary bits back to disk */
    318 		if (BUF_ISWRITE(bp) && !async) {
    319 			rumpuser_memsync(ioaddr, bp->b_bcount, &error);
    320 		}
    321 		rump_biodone(bp, bp->b_bcount, 0);
    322 
    323 		return;
    324 	}
    325 
    326 	/*
    327 	 * Do I/O.  We have different paths for async and sync I/O.
    328 	 * Async I/O is done by passing a request to rumpuser where
    329 	 * it is executed.  The rumpuser routine then calls
    330 	 * biodone() to signal any waiters in the kernel.  I/O's are
    331 	 * executed in series.  Technically executing them in parallel
    332 	 * would produce better results, but then we'd need either
    333 	 * more threads or posix aio.  Maybe worth investigating
    334 	 * this later.
    335 	 *
    336 	 * Using bufq here might be a good idea.
    337 	 */
    338 	if (rump_threads) {
    339 		struct rumpuser_aio *rua;
    340 
    341 		rumpuser_mutex_enter(&rumpuser_aio_mtx);
    342 		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail)
    343 			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
    344 
    345 		rua = &rumpuser_aios[rumpuser_aio_head];
    346 		KASSERT(rua->rua_bp == NULL);
    347 		rua->rua_fd = rblk->rblk_fd;
    348 		rua->rua_data = bp->b_data;
    349 		rua->rua_dlen = bp->b_bcount;
    350 		rua->rua_off = off;
    351 		rua->rua_bp = bp;
    352 		rua->rua_op = BUF_ISREAD(bp);
    353 
    354 		/* insert into queue & signal */
    355 		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
    356 		rumpuser_cv_signal(&rumpuser_aio_cv);
    357 		rumpuser_mutex_exit(&rumpuser_aio_mtx);
    358 
    359 		/* make sure non-async writes end up on backing media */
    360 		if (BUF_ISWRITE(bp) && !async) {
    361 			biowait(bp);
    362 			rumpuser_fsync(rblk->rblk_fd, &error);
    363 		}
    364 	} else {
    365 		if (BUF_ISREAD(bp)) {
    366 			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
    367 			    bp->b_bcount, off, rump_biodone, bp);
    368 		} else {
    369 			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
    370 			    bp->b_bcount, off, rump_biodone, bp);
    371 		}
    372 		if (!async) {
    373 			if (BUF_ISWRITE(bp))
    374 				rumpuser_fsync(rblk->rblk_fd, &error);
    375 		}
    376 	}
    377 }
    378 
/*
 * Strategy entry point of the regular block device switch: passes the
 * request on to dostrategy() unchanged.
 */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
    385 
    386 /*
    387  * Simple random number generator.  This is private so that we can
    388  * very repeatedly control which blocks will fail.
    389  *
    390  * <mlelstv> pooka, rand()
    391  * <mlelstv> [paste]
    392  */
    393 static unsigned
    394 gimmerand(void)
    395 {
    396 
    397 	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
    398 }
    399 
    400 /*
    401  * Block device with very simple fault injection.  Fails every
    402  * n out of BLKFAIL_MAX I/O with EIO.  n is determined by the env
    403  * variable RUMP_BLKFAIL.
    404  */
    405 void
    406 rumpblk_strategy_fail(struct buf *bp)
    407 {
    408 
    409 	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
    410 		dostrategy(bp);
    411 	} else {
    412 		printf("block fault injection: failing I/O on block %lld\n",
    413 		    (long long)bp->b_blkno);
    414 		bp->b_error = EIO;
    415 		biodone(bp);
    416 	}
    417 }
    418