Home | History | Annotate | Line # | Download | only in rumpvfs
rumpblk.c revision 1.52
      1 /*	$NetBSD: rumpblk.c,v 1.52 2013/04/29 17:31:05 pooka Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Development of this software was supported by the
      7  * Finnish Cultural Foundation.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     19  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     21  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     24  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     28  * SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * Block device emulation.  Presents a block device interface and
     33  * uses rumpuser system calls to satisfy I/O requests.
     34  *
     35  * We provide fault injection.  The driver can be made to fail
     36  * I/O occasionally.
     37  */
     38 
     39 #include <sys/cdefs.h>
     40 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.52 2013/04/29 17:31:05 pooka Exp $");
     41 
     42 #include <sys/param.h>
     43 #include <sys/buf.h>
     44 #include <sys/conf.h>
     45 #include <sys/condvar.h>
     46 #include <sys/disklabel.h>
     47 #include <sys/evcnt.h>
     48 #include <sys/fcntl.h>
     49 #include <sys/kmem.h>
     50 #include <sys/malloc.h>
     51 #include <sys/queue.h>
     52 #include <sys/stat.h>
     53 #include <sys/cprng.h>
     54 
     55 #include <rump/rumpuser.h>
     56 
     57 #include "rump_private.h"
     58 #include "rump_vfs_private.h"
     59 
     60 #if 0
     61 #define DPRINTF(x) printf x
     62 #else
     63 #define DPRINTF(x)
     64 #endif
     65 
     66 #define RUMPBLK_SIZE 16
     67 static struct rblkdev {
     68 	char *rblk_path;
     69 	int rblk_fd;
     70 	int rblk_mode;
     71 
     72 	uint64_t rblk_size;
     73 	uint64_t rblk_hostoffset;
     74 	uint64_t rblk_hostsize;
     75 	int rblk_ftype;
     76 
     77 	struct disklabel rblk_label;
     78 } minors[RUMPBLK_SIZE];
     79 
     80 static struct evcnt ev_io_total;
     81 static struct evcnt ev_io_async;
     82 
     83 static struct evcnt ev_bwrite_total;
     84 static struct evcnt ev_bwrite_async;
     85 static struct evcnt ev_bread_total;
     86 
     87 dev_type_open(rumpblk_open);
     88 dev_type_close(rumpblk_close);
     89 dev_type_read(rumpblk_read);
     90 dev_type_write(rumpblk_write);
     91 dev_type_ioctl(rumpblk_ioctl);
     92 dev_type_strategy(rumpblk_strategy);
     93 dev_type_strategy(rumpblk_strategy_fail);
     94 dev_type_dump(rumpblk_dump);
     95 dev_type_size(rumpblk_size);
     96 
     97 static const struct bdevsw rumpblk_bdevsw = {
     98 	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
     99 	nodump, nosize, D_DISK
    100 };
    101 
    102 static const struct bdevsw rumpblk_bdevsw_fail = {
    103 	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
    104 	nodump, nosize, D_DISK
    105 };
    106 
    107 static const struct cdevsw rumpblk_cdevsw = {
    108 	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
    109 	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    110 };
    111 
    112 static int backend_open(struct rblkdev *, const char *);
    113 static int backend_close(struct rblkdev *);
    114 
    115 /* fail every n out of BLKFAIL_MAX */
    116 #define BLKFAIL_MAX 10000
    117 static int blkfail;
    118 static unsigned randstate;
    119 static kmutex_t rumpblk_lock;
    120 static int sectshift = DEV_BSHIFT;
    121 
    122 static void
    123 makedefaultlabel(struct disklabel *lp, off_t size, int part)
    124 {
    125 	int i;
    126 
    127 	memset(lp, 0, sizeof(*lp));
    128 
    129 	lp->d_secperunit = size;
    130 	lp->d_secsize = 1 << sectshift;
    131 	lp->d_nsectors = size >> sectshift;
    132 	lp->d_ntracks = 1;
    133 	lp->d_ncylinders = 1;
    134 	lp->d_secpercyl = lp->d_nsectors;
    135 
    136 	/* oh dear oh dear */
    137 	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
    138 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
    139 
    140 	lp->d_type = DTYPE_RUMPD;
    141 	lp->d_rpm = 11;
    142 	lp->d_interleave = 1;
    143 	lp->d_flags = 0;
    144 
    145 	/* XXX: RAW_PART handling? */
    146 	for (i = 0; i < part; i++) {
    147 		lp->d_partitions[i].p_fstype = FS_UNUSED;
    148 	}
    149 	lp->d_partitions[part].p_size = size >> sectshift;
    150 	lp->d_npartitions = part+1;
    151 	/* XXX: file system type? */
    152 
    153 	lp->d_magic = DISKMAGIC;
    154 	lp->d_magic2 = DISKMAGIC;
    155 	lp->d_checksum = 0; /* XXX */
    156 }
    157 
    158 int
    159 rumpblk_init(void)
    160 {
    161 	char buf[64];
    162 	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
    163 	unsigned tmp;
    164 	int error, i;
    165 
    166 	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
    167 
    168 	if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
    169 		blkfail = strtoul(buf, NULL, 10);
    170 		/* fail everything */
    171 		if (blkfail > BLKFAIL_MAX)
    172 			blkfail = BLKFAIL_MAX;
    173 		if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
    174 		    buf, sizeof(buf)) == 0) {
    175 			randstate = strtoul(buf, NULL, 10);
    176 		} else {
    177 			randstate = cprng_fast32();
    178 		}
    179 		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
    180 		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
    181 	} else {
    182 		blkfail = 0;
    183 	}
    184 
    185 	if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
    186 		printf("rumpblk: ");
    187 		tmp = strtoul(buf, NULL, 10);
    188 		if (tmp >= DEV_BSHIFT)
    189 			sectshift = tmp;
    190 		else
    191 			printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
    192 			   DEV_BSHIFT, tmp);
    193 		printf("using %d for sector shift (size %d)\n",
    194 		    sectshift, 1<<sectshift);
    195 	}
    196 
    197 	memset(minors, 0, sizeof(minors));
    198 	for (i = 0; i < RUMPBLK_SIZE; i++) {
    199 		minors[i].rblk_fd = -1;
    200 	}
    201 
    202 	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
    203 	    "rumpblk", "I/O reqs");
    204 	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
    205 	    "rumpblk", "async I/O");
    206 
    207 	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
    208 	    "rumpblk", "bytes read");
    209 	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
    210 	    "rumpblk", "bytes written");
    211 	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
    212 	    "rumpblk", "bytes written async");
    213 
    214 	if (blkfail) {
    215 		return devsw_attach("rumpblk",
    216 		    &rumpblk_bdevsw_fail, &rumpblkmaj,
    217 		    &rumpblk_cdevsw, &rumpblkmaj);
    218 	} else {
    219 		return devsw_attach("rumpblk",
    220 		    &rumpblk_bdevsw, &rumpblkmaj,
    221 		    &rumpblk_cdevsw, &rumpblkmaj);
    222 	}
    223 }
    224 
    225 int
    226 rumpblk_register(const char *path, devminor_t *dmin,
    227 	uint64_t offset, uint64_t size)
    228 {
    229 	struct rblkdev *rblk;
    230 	uint64_t flen;
    231 	size_t len;
    232 	int ftype, error, i;
    233 
    234 	/* devices might not report correct size unless they're open */
    235 	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
    236 		return error;
    237 
    238 	/* verify host file is of supported type */
    239 	if (!(ftype == RUMPUSER_FT_REG
    240 	   || ftype == RUMPUSER_FT_BLK
    241 	   || ftype == RUMPUSER_FT_CHR))
    242 		return EINVAL;
    243 
    244 	mutex_enter(&rumpblk_lock);
    245 	for (i = 0; i < RUMPBLK_SIZE; i++) {
    246 		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
    247 			mutex_exit(&rumpblk_lock);
    248 			*dmin = i;
    249 			return 0;
    250 		}
    251 	}
    252 
    253 	for (i = 0; i < RUMPBLK_SIZE; i++)
    254 		if (minors[i].rblk_path == NULL)
    255 			break;
    256 	if (i == RUMPBLK_SIZE) {
    257 		mutex_exit(&rumpblk_lock);
    258 		return EBUSY;
    259 	}
    260 
    261 	rblk = &minors[i];
    262 	rblk->rblk_path = __UNCONST("taken");
    263 	mutex_exit(&rumpblk_lock);
    264 
    265 	len = strlen(path);
    266 	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
    267 	strcpy(rblk->rblk_path, path);
    268 	rblk->rblk_hostoffset = offset;
    269 	if (size != RUMPBLK_SIZENOTSET) {
    270 		KASSERT(size + offset <= flen);
    271 		rblk->rblk_size = size;
    272 	} else {
    273 		KASSERT(offset < flen);
    274 		rblk->rblk_size = flen - offset;
    275 	}
    276 	rblk->rblk_hostsize = flen;
    277 	rblk->rblk_ftype = ftype;
    278 	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
    279 
    280 	if ((error = backend_open(rblk, path)) != 0) {
    281 		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
    282 		free(rblk->rblk_path, M_TEMP);
    283 		rblk->rblk_path = NULL;
    284 		return error;
    285 	}
    286 
    287 	*dmin = i;
    288 	return 0;
    289 }
    290 
    291 /*
    292  * Unregister rumpblk.  It's the callers responsibility to make
    293  * sure it's no longer in use.
    294  */
    295 int
    296 rumpblk_deregister(const char *path)
    297 {
    298 	struct rblkdev *rblk;
    299 	int i;
    300 
    301 	mutex_enter(&rumpblk_lock);
    302 	for (i = 0; i < RUMPBLK_SIZE; i++) {
    303 		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
    304 			break;
    305 		}
    306 	}
    307 	mutex_exit(&rumpblk_lock);
    308 
    309 	if (i == RUMPBLK_SIZE)
    310 		return ENOENT;
    311 
    312 	rblk = &minors[i];
    313 	backend_close(rblk);
    314 
    315 	free(rblk->rblk_path, M_TEMP);
    316 	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
    317 	rblk->rblk_path = NULL;
    318 
    319 	return 0;
    320 }
    321 
    322 static int
    323 backend_open(struct rblkdev *rblk, const char *path)
    324 {
    325 	int error, fd;
    326 
    327 	KASSERT(rblk->rblk_fd == -1);
    328 	fd = rumpuser_open(path,
    329 	    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &error);
    330 	if (error) {
    331 		fd = rumpuser_open(path,
    332 		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &error);
    333 		if (error)
    334 			return error;
    335 		rblk->rblk_mode = FREAD;
    336 	} else {
    337 		rblk->rblk_mode = FREAD|FWRITE;
    338 	}
    339 
    340 	rblk->rblk_fd = fd;
    341 	KASSERT(rblk->rblk_fd != -1);
    342 	return 0;
    343 }
    344 
    345 static int
    346 backend_close(struct rblkdev *rblk)
    347 {
    348 	int dummy;
    349 
    350 	rumpuser_close(rblk->rblk_fd, &dummy);
    351 	rblk->rblk_fd = -1;
    352 
    353 	return 0;
    354 }
    355 
    356 int
    357 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
    358 {
    359 	struct rblkdev *rblk = &minors[minor(dev)];
    360 
    361 	if (rblk->rblk_fd == -1)
    362 		return ENXIO;
    363 
    364 	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
    365 		return EACCES;
    366 	}
    367 
    368 	return 0;
    369 }
    370 
/*
 * Close entry point.  There is no per-open state to tear down, so
 * all arguments are ignored and the call always succeeds.
 */
int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	const int success = 0;

	return success;
}
    377 
    378 int
    379 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
    380 {
    381 	devminor_t dmin = minor(dev);
    382 	struct rblkdev *rblk = &minors[dmin];
    383 	struct partinfo *pi;
    384 	int error = 0;
    385 
    386 	/* well, me should support a few more, but we don't for now */
    387 	switch (xfer) {
    388 	case DIOCGDINFO:
    389 		*(struct disklabel *)addr = rblk->rblk_label;
    390 		break;
    391 
    392 	case DIOCGPART:
    393 		pi = addr;
    394 		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
    395 		pi->disklab = &rblk->rblk_label;
    396 		break;
    397 
    398 	/* it's synced enough along the write path */
    399 	case DIOCCACHESYNC:
    400 		break;
    401 
    402 	default:
    403 		error = ENOTTY;
    404 		break;
    405 	}
    406 
    407 	return error;
    408 }
    409 
    410 static int
    411 do_physio(dev_t dev, struct uio *uio, int which)
    412 {
    413 	void (*strat)(struct buf *);
    414 
    415 	if (blkfail)
    416 		strat = rumpblk_strategy_fail;
    417 	else
    418 		strat = rumpblk_strategy;
    419 
    420 	return physio(strat, NULL, dev, which, minphys, uio);
    421 }
    422 
    423 int
    424 rumpblk_read(dev_t dev, struct uio *uio, int flags)
    425 {
    426 
    427 	return do_physio(dev, uio, B_READ);
    428 }
    429 
    430 int
    431 rumpblk_write(dev_t dev, struct uio *uio, int flags)
    432 {
    433 
    434 	return do_physio(dev, uio, B_WRITE);
    435 }
    436 
    437 static void
    438 dostrategy(struct buf *bp)
    439 {
    440 	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
    441 	off_t off;
    442 	int async = bp->b_flags & B_ASYNC;
    443 	int op;
    444 
    445 	if (bp->b_bcount % (1<<sectshift) != 0) {
    446 		rump_biodone(bp, 0, EINVAL);
    447 		return;
    448 	}
    449 
    450 	/* collect statistics */
    451 	ev_io_total.ev_count++;
    452 	if (async)
    453 		ev_io_async.ev_count++;
    454 	if (BUF_ISWRITE(bp)) {
    455 		ev_bwrite_total.ev_count += bp->b_bcount;
    456 		if (async)
    457 			ev_bwrite_async.ev_count += bp->b_bcount;
    458 	} else {
    459 		ev_bread_total.ev_count++;
    460 	}
    461 
    462 	/*
    463 	 * b_blkno is always in terms of DEV_BSIZE, and since we need
    464 	 * to translate to a byte offset for the host read, this
    465 	 * calculation does not need sectshift.
    466 	 */
    467 	off = bp->b_blkno << DEV_BSHIFT;
    468 
    469 	/*
    470 	 * Do bounds checking if we're working on a file.  Otherwise
    471 	 * invalid file systems might attempt to read beyond EOF.  This
    472 	 * is bad(tm) especially on mmapped images.  This is essentially
    473 	 * the kernel bounds_check() routines.
    474 	 */
    475 	if (off + bp->b_bcount > rblk->rblk_size) {
    476 		int64_t sz = rblk->rblk_size - off;
    477 
    478 		/* EOF */
    479 		if (sz == 0) {
    480 			rump_biodone(bp, 0, 0);
    481 			return;
    482 		}
    483 		/* beyond EOF ==> error */
    484 		if (sz < 0) {
    485 			rump_biodone(bp, 0, EINVAL);
    486 			return;
    487 		}
    488 
    489 		/* truncate to device size */
    490 		bp->b_bcount = sz;
    491 	}
    492 
    493 	off += rblk->rblk_hostoffset;
    494 	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
    495 	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
    496 	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
    497 	    off, off, (off + bp->b_bcount), async ? "a" : ""));
    498 
    499 	op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
    500 	if (BUF_ISWRITE(bp) && !async)
    501 		op |= RUMPUSER_BIO_SYNC;
    502 
    503 	rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
    504 	    rump_biodone, bp);
    505 }
    506 
/*
 * Strategy routine used when fault injection is off: every request
 * is passed straight to dostrategy().
 */
void
rumpblk_strategy(struct buf *bp)
{
	dostrategy(bp);
}
    513 
    514 /*
    515  * Simple random number generator.  This is private so that we can
    516  * very repeatedly control which blocks will fail.
    517  *
    518  * <mlelstv> pooka, rand()
    519  * <mlelstv> [paste]
    520  */
    521 static unsigned
    522 gimmerand(void)
    523 {
    524 
    525 	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
    526 }
    527 
    528 /*
    529  * Block device with very simple fault injection.  Fails every
    530  * n out of BLKFAIL_MAX I/O with EIO.  n is determined by the env
    531  * variable RUMP_BLKFAIL.
    532  */
    533 void
    534 rumpblk_strategy_fail(struct buf *bp)
    535 {
    536 
    537 	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
    538 		dostrategy(bp);
    539 	} else {
    540 		printf("block fault injection: failing I/O on block %lld\n",
    541 		    (long long)bp->b_blkno);
    542 		bp->b_error = EIO;
    543 		biodone(bp);
    544 	}
    545 }
    546