rumpblk.c revision 1.57 1 /* $NetBSD: rumpblk.c,v 1.57 2014/07/25 08:10:40 dholland Exp $ */
2
3 /*
4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 *
6 * Development of this software was supported by the
7 * Finnish Cultural Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 /*
32 * Block device emulation. Presents a block device interface and
33 * uses rumpuser system calls to satisfy I/O requests.
34 *
35 * We provide fault injection. The driver can be made to fail
36 * I/O occasionally.
37 */
38
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.57 2014/07/25 08:10:40 dholland Exp $");
41
42 #include <sys/param.h>
43 #include <sys/buf.h>
44 #include <sys/conf.h>
45 #include <sys/condvar.h>
46 #include <sys/disklabel.h>
47 #include <sys/evcnt.h>
48 #include <sys/fcntl.h>
49 #include <sys/kmem.h>
50 #include <sys/malloc.h>
51 #include <sys/queue.h>
52 #include <sys/stat.h>
53 #include <sys/cprng.h>
54
55 #include <rump/rumpuser.h>
56
57 #include "rump_private.h"
58 #include "rump_vfs_private.h"
59
/*
 * Debug tracing.  DPRINTF() takes a complete, parenthesized printf
 * argument list, e.g. DPRINTF(("x = %d\n", x)).  It expands to nothing
 * unless the condition below is flipped to enable the printf variant.
 */
#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif
65
66 #define RUMPBLK_SIZE 16
67 static struct rblkdev {
68 char *rblk_path;
69 int rblk_fd;
70 int rblk_mode;
71
72 uint64_t rblk_size;
73 uint64_t rblk_hostoffset;
74 uint64_t rblk_hostsize;
75 int rblk_ftype;
76
77 struct disklabel rblk_label;
78 } minors[RUMPBLK_SIZE];
79
80 static struct evcnt ev_io_total;
81 static struct evcnt ev_io_async;
82
83 static struct evcnt ev_bwrite_total;
84 static struct evcnt ev_bwrite_async;
85 static struct evcnt ev_bread_total;
86
87 dev_type_open(rumpblk_open);
88 dev_type_close(rumpblk_close);
89 dev_type_read(rumpblk_read);
90 dev_type_write(rumpblk_write);
91 dev_type_ioctl(rumpblk_ioctl);
92 dev_type_strategy(rumpblk_strategy);
93 dev_type_strategy(rumpblk_strategy_fail);
94 dev_type_dump(rumpblk_dump);
95 dev_type_size(rumpblk_size);
96
97 static const struct bdevsw rumpblk_bdevsw = {
98 .d_open = rumpblk_open,
99 .d_close = rumpblk_close,
100 .d_strategy = rumpblk_strategy,
101 .d_ioctl = rumpblk_ioctl,
102 .d_dump = nodump,
103 .d_psize = nosize,
104 .d_discard = nodiscard,
105 .d_flag = D_DISK
106 };
107
108 static const struct bdevsw rumpblk_bdevsw_fail = {
109 .d_open = rumpblk_open,
110 .d_close = rumpblk_close,
111 .d_strategy = rumpblk_strategy_fail,
112 .d_ioctl = rumpblk_ioctl,
113 .d_dump = nodump,
114 .d_psize = nosize,
115 .d_discard = nodiscard,
116 .d_flag = D_DISK
117 };
118
119 static const struct cdevsw rumpblk_cdevsw = {
120 .d_open = rumpblk_open,
121 .d_close = rumpblk_close,
122 .d_read = rumpblk_read,
123 .d_write = rumpblk_write,
124 .d_ioctl = rumpblk_ioctl,
125 .d_stop = nostop,
126 .d_tty = notty,
127 .d_poll = nopoll,
128 .d_mmap = nommap,
129 .d_kqfilter = nokqfilter,
130 .d_discard = nodiscard,
131 .d_flag = D_DISK
132 };
133
134 static int backend_open(struct rblkdev *, const char *);
135 static int backend_close(struct rblkdev *);
136
137 /* fail every n out of BLKFAIL_MAX */
138 #define BLKFAIL_MAX 10000
139 static int blkfail;
140 static unsigned randstate;
141 static kmutex_t rumpblk_lock;
142 static int sectshift = DEV_BSHIFT;
143
144 static void
145 makedefaultlabel(struct disklabel *lp, off_t size, int part)
146 {
147 int i;
148
149 memset(lp, 0, sizeof(*lp));
150
151 lp->d_secperunit = size;
152 lp->d_secsize = 1 << sectshift;
153 lp->d_nsectors = size >> sectshift;
154 lp->d_ntracks = 1;
155 lp->d_ncylinders = 1;
156 lp->d_secpercyl = lp->d_nsectors;
157
158 /* oh dear oh dear */
159 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
160 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
161
162 lp->d_type = DTYPE_RUMPD;
163 lp->d_rpm = 11;
164 lp->d_interleave = 1;
165 lp->d_flags = 0;
166
167 /* XXX: RAW_PART handling? */
168 for (i = 0; i < part; i++) {
169 lp->d_partitions[i].p_fstype = FS_UNUSED;
170 }
171 lp->d_partitions[part].p_size = size >> sectshift;
172 lp->d_npartitions = part+1;
173 /* XXX: file system type? */
174
175 lp->d_magic = DISKMAGIC;
176 lp->d_magic2 = DISKMAGIC;
177 lp->d_checksum = 0; /* XXX */
178 }
179
180 int
181 rumpblk_init(void)
182 {
183 char buf[64];
184 devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
185 unsigned tmp;
186 int i;
187
188 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
189
190 if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
191 blkfail = strtoul(buf, NULL, 10);
192 /* fail everything */
193 if (blkfail > BLKFAIL_MAX)
194 blkfail = BLKFAIL_MAX;
195 if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
196 buf, sizeof(buf)) == 0) {
197 randstate = strtoul(buf, NULL, 10);
198 } else {
199 randstate = cprng_fast32();
200 }
201 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
202 "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
203 } else {
204 blkfail = 0;
205 }
206
207 if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
208 printf("rumpblk: ");
209 tmp = strtoul(buf, NULL, 10);
210 if (tmp >= DEV_BSHIFT)
211 sectshift = tmp;
212 else
213 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
214 DEV_BSHIFT, tmp);
215 printf("using %d for sector shift (size %d)\n",
216 sectshift, 1<<sectshift);
217 }
218
219 memset(minors, 0, sizeof(minors));
220 for (i = 0; i < RUMPBLK_SIZE; i++) {
221 minors[i].rblk_fd = -1;
222 }
223
224 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
225 "rumpblk", "I/O reqs");
226 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
227 "rumpblk", "async I/O");
228
229 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
230 "rumpblk", "bytes read");
231 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
232 "rumpblk", "bytes written");
233 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
234 "rumpblk", "bytes written async");
235
236 if (blkfail) {
237 return devsw_attach("rumpblk",
238 &rumpblk_bdevsw_fail, &rumpblkmaj,
239 &rumpblk_cdevsw, &rumpblkmaj);
240 } else {
241 return devsw_attach("rumpblk",
242 &rumpblk_bdevsw, &rumpblkmaj,
243 &rumpblk_cdevsw, &rumpblkmaj);
244 }
245 }
246
247 int
248 rumpblk_register(const char *path, devminor_t *dmin,
249 uint64_t offset, uint64_t size)
250 {
251 struct rblkdev *rblk;
252 uint64_t flen;
253 size_t len;
254 int ftype, error, i;
255
256 /* devices might not report correct size unless they're open */
257 if ((error = rumpuser_getfileinfo(path, &flen, &ftype)) != 0)
258 return error;
259
260 /* verify host file is of supported type */
261 if (!(ftype == RUMPUSER_FT_REG
262 || ftype == RUMPUSER_FT_BLK
263 || ftype == RUMPUSER_FT_CHR))
264 return EINVAL;
265
266 mutex_enter(&rumpblk_lock);
267 for (i = 0; i < RUMPBLK_SIZE; i++) {
268 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
269 mutex_exit(&rumpblk_lock);
270 *dmin = i;
271 return 0;
272 }
273 }
274
275 for (i = 0; i < RUMPBLK_SIZE; i++)
276 if (minors[i].rblk_path == NULL)
277 break;
278 if (i == RUMPBLK_SIZE) {
279 mutex_exit(&rumpblk_lock);
280 return EBUSY;
281 }
282
283 rblk = &minors[i];
284 rblk->rblk_path = __UNCONST("taken");
285 mutex_exit(&rumpblk_lock);
286
287 len = strlen(path);
288 rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
289 strcpy(rblk->rblk_path, path);
290 rblk->rblk_hostoffset = offset;
291 if (size != RUMPBLK_SIZENOTSET) {
292 KASSERT(size + offset <= flen);
293 rblk->rblk_size = size;
294 } else {
295 KASSERT(offset < flen);
296 rblk->rblk_size = flen - offset;
297 }
298 rblk->rblk_hostsize = flen;
299 rblk->rblk_ftype = ftype;
300 makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
301
302 if ((error = backend_open(rblk, path)) != 0) {
303 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
304 free(rblk->rblk_path, M_TEMP);
305 rblk->rblk_path = NULL;
306 return error;
307 }
308
309 *dmin = i;
310 return 0;
311 }
312
313 /*
314 * Unregister rumpblk. It's the callers responsibility to make
315 * sure it's no longer in use.
316 */
317 int
318 rumpblk_deregister(const char *path)
319 {
320 struct rblkdev *rblk;
321 int i;
322
323 mutex_enter(&rumpblk_lock);
324 for (i = 0; i < RUMPBLK_SIZE; i++) {
325 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
326 break;
327 }
328 }
329 mutex_exit(&rumpblk_lock);
330
331 if (i == RUMPBLK_SIZE)
332 return ENOENT;
333
334 rblk = &minors[i];
335 backend_close(rblk);
336
337 free(rblk->rblk_path, M_TEMP);
338 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
339 rblk->rblk_path = NULL;
340
341 return 0;
342 }
343
344 static int
345 backend_open(struct rblkdev *rblk, const char *path)
346 {
347 int error, fd;
348
349 KASSERT(rblk->rblk_fd == -1);
350 error = rumpuser_open(path,
351 RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
352 if (error) {
353 error = rumpuser_open(path,
354 RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
355 if (error)
356 return error;
357 rblk->rblk_mode = FREAD;
358 } else {
359 rblk->rblk_mode = FREAD|FWRITE;
360 }
361
362 rblk->rblk_fd = fd;
363 KASSERT(rblk->rblk_fd != -1);
364 return 0;
365 }
366
367 static int
368 backend_close(struct rblkdev *rblk)
369 {
370
371 rumpuser_close(rblk->rblk_fd);
372 rblk->rblk_fd = -1;
373
374 return 0;
375 }
376
377 int
378 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
379 {
380 struct rblkdev *rblk = &minors[minor(dev)];
381
382 if (rblk->rblk_fd == -1)
383 return ENXIO;
384
385 if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
386 return EACCES;
387 }
388
389 return 0;
390 }
391
392 int
393 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
394 {
395
396 return 0;
397 }
398
399 int
400 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
401 {
402 devminor_t dmin = minor(dev);
403 struct rblkdev *rblk = &minors[dmin];
404 struct partinfo *pi;
405 int error = 0;
406
407 /* well, me should support a few more, but we don't for now */
408 switch (xfer) {
409 case DIOCGDINFO:
410 *(struct disklabel *)addr = rblk->rblk_label;
411 break;
412
413 case DIOCGPART:
414 pi = addr;
415 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
416 pi->disklab = &rblk->rblk_label;
417 break;
418
419 /* it's synced enough along the write path */
420 case DIOCCACHESYNC:
421 break;
422
423 default:
424 error = ENOTTY;
425 break;
426 }
427
428 return error;
429 }
430
431 static int
432 do_physio(dev_t dev, struct uio *uio, int which)
433 {
434 void (*strat)(struct buf *);
435
436 if (blkfail)
437 strat = rumpblk_strategy_fail;
438 else
439 strat = rumpblk_strategy;
440
441 return physio(strat, NULL, dev, which, minphys, uio);
442 }
443
444 int
445 rumpblk_read(dev_t dev, struct uio *uio, int flags)
446 {
447
448 return do_physio(dev, uio, B_READ);
449 }
450
451 int
452 rumpblk_write(dev_t dev, struct uio *uio, int flags)
453 {
454
455 return do_physio(dev, uio, B_WRITE);
456 }
457
458 static void
459 dostrategy(struct buf *bp)
460 {
461 struct rblkdev *rblk = &minors[minor(bp->b_dev)];
462 off_t off;
463 int async = bp->b_flags & B_ASYNC;
464 int op;
465
466 if (bp->b_bcount % (1<<sectshift) != 0) {
467 rump_biodone(bp, 0, EINVAL);
468 return;
469 }
470
471 /* collect statistics */
472 ev_io_total.ev_count++;
473 if (async)
474 ev_io_async.ev_count++;
475 if (BUF_ISWRITE(bp)) {
476 ev_bwrite_total.ev_count += bp->b_bcount;
477 if (async)
478 ev_bwrite_async.ev_count += bp->b_bcount;
479 } else {
480 ev_bread_total.ev_count++;
481 }
482
483 /*
484 * b_blkno is always in terms of DEV_BSIZE, and since we need
485 * to translate to a byte offset for the host read, this
486 * calculation does not need sectshift.
487 */
488 off = bp->b_blkno << DEV_BSHIFT;
489
490 /*
491 * Do bounds checking if we're working on a file. Otherwise
492 * invalid file systems might attempt to read beyond EOF. This
493 * is bad(tm) especially on mmapped images. This is essentially
494 * the kernel bounds_check() routines.
495 */
496 if (off + bp->b_bcount > rblk->rblk_size) {
497 int64_t sz = rblk->rblk_size - off;
498
499 /* EOF */
500 if (sz == 0) {
501 rump_biodone(bp, 0, 0);
502 return;
503 }
504 /* beyond EOF ==> error */
505 if (sz < 0) {
506 rump_biodone(bp, 0, EINVAL);
507 return;
508 }
509
510 /* truncate to device size */
511 bp->b_bcount = sz;
512 }
513
514 off += rblk->rblk_hostoffset;
515 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
516 " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
517 bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
518 off, off, (off + bp->b_bcount), async ? "a" : ""));
519
520 op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
521 if (BUF_ISWRITE(bp) && !async)
522 op |= RUMPUSER_BIO_SYNC;
523
524 rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
525 rump_biodone, bp);
526 }
527
/*
 * Strategy routine used when fault injection is off: every request
 * goes straight to dostrategy().
 */
void
rumpblk_strategy(struct buf *bp)
{
	dostrategy(bp);
}
534
535 /*
536 * Simple random number generator. This is private so that we can
537 * very repeatedly control which blocks will fail.
538 *
539 * <mlelstv> pooka, rand()
540 * <mlelstv> [paste]
541 */
542 static unsigned
543 gimmerand(void)
544 {
545
546 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
547 }
548
549 /*
550 * Block device with very simple fault injection. Fails every
551 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
552 * variable RUMP_BLKFAIL.
553 */
554 void
555 rumpblk_strategy_fail(struct buf *bp)
556 {
557
558 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
559 dostrategy(bp);
560 } else {
561 printf("block fault injection: failing I/O on block %lld\n",
562 (long long)bp->b_blkno);
563 bp->b_error = EIO;
564 biodone(bp);
565 }
566 }
567