/* $NetBSD: rumpblk.c,v 1.64 2016/07/07 06:55:44 msaitoh Exp $ */

/*
 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation. Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection. The driver can be made to fail
 * I/O occasionally.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.64 2016/07/07 06:55:44 msaitoh Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/cprng.h>

#include <rump-sys/kern.h>
#include <rump-sys/vfs.h>

#include <rump/rumpuser.h>

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_mode;

	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	uint64_t rblk_hostsize;
	int rblk_ftype;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];
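
/*
 * A note on the fields above, inferred from how rumpblk_register() and
 * dostrategy() use them: rblk_hostoffset and rblk_size select the window
 * of the host file that is exposed as the block device, rblk_hostsize is
 * the size of the entire host file, and rblk_ftype is its rumpuser file
 * type (regular, block or character special).
 */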

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy_fail,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_read = rumpblk_read,
	.d_write = rumpblk_write,
	.d_ioctl = rumpblk_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail n out of every BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DKTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
		    buf, sizeof(buf)) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = cprng_fast32();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d "
			    "(now %d), ", DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		minors[i].rblk_fd = -1;
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if ((error = rumpuser_getfileinfo(path, &flen, &ftype)) != 0)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk. It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	backend_close(rblk);

	free(rblk->rblk_path, M_TEMP);
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
	rblk->rblk_path = NULL;

	return 0;
}

/*
 * Release all backend resources, to be called only when the rump
 * kernel is being shut down.
 * This routine does not do a full "fini" since we're going down anyway.
 */
void
rumpblk_fini(void)
{
	int i;

	for (i = 0; i < RUMPBLK_SIZE; i++) {
		struct rblkdev *rblk;

		rblk = &minors[i];
		if (rblk->rblk_fd != -1)
			backend_close(rblk);
	}
}

static int
backend_open(struct rblkdev *rblk, const char *path)
{
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	error = rumpuser_open(path,
	    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
	if (error) {
		error = rumpuser_open(path,
		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
		if (error)
			return error;
		rblk->rblk_mode = FREAD;
	} else {
		rblk->rblk_mode = FREAD|FWRITE;
	}

	rblk->rblk_fd = fd;
	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

static int
backend_close(struct rblkdev *rblk)
{

	rumpuser_close(rblk->rblk_fd);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];

	if (rblk->rblk_fd == -1)
		return ENXIO;

	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
		return EACCES;
	}

	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	struct partition *dp;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPARTINFO:
		dp = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi = addr;
		pi->pi_offset = dp->p_offset;
		pi->pi_size = dp->p_size;
		pi->pi_secsize = rblk->rblk_label.d_secsize;
		pi->pi_bsize = BLKDEV_IOSIZE;
		pi->pi_fstype = dp->p_fstype;
		pi->pi_fsize = dp->p_fsize;
		pi->pi_frag = dp->p_frag;
		pi->pi_cpg = dp->p_cpg;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	case DIOCGMEDIASIZE:
		*(off_t *)addr = (off_t)rblk->rblk_size;
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int op;

	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count += bp->b_bcount;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file. Otherwise
	 * invalid file systems might attempt to read beyond EOF. This
	 * is bad(tm) especially on mmapped images. This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
	if (BUF_ISWRITE(bp) && !async)
		op |= RUMPUSER_BIO_SYNC;

	rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
	    rump_biodone, bp);
}
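
/*
 * Worked example for the offset translation in dostrategy(), assuming
 * the usual NetBSD DEV_BSHIFT of 9 (DEV_BSIZE 512): a buffer with
 * b_blkno 16 maps to host byte offset 16 << 9 = 8192 before
 * rblk_hostoffset is added. A non-default RUMP_BLKSECTSHIFT affects
 * only the disklabel and the b_bcount alignment check at the top of
 * dostrategy(), not this translation.
 */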

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator. This is private so that we can
 * repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection. Fails n out of
 * every BLKFAIL_MAX I/Os with EIO. n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}
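
/*
 * Illustrative usage note (a sketch, not part of the driver proper):
 * a host test program could enable fault injection before bootstrapping
 * the rump kernel, e.g.
 *
 *	setenv("RUMP_BLKFAIL", "100", 1);	(roughly 1% of I/Os fail)
 *	setenv("RUMP_BLKFAIL_SEED", "42", 1);	(deterministic failures)
 *	rump_init();
 *
 * gimmerand() is the classic rand()-style LCG (multiplier 1103515245,
 * increment 12345), so with RUMP_BLKFAIL=n each request passing through
 * rumpblk_strategy_fail() fails with probability roughly n/BLKFAIL_MAX,
 * i.e. n/10000. The value is clamped to BLKFAIL_MAX, at which point
 * every request fails.
 */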