rumpblk.c revision 1.64.2.2 1 /* $NetBSD: rumpblk.c,v 1.64.2.2 2016/07/19 06:27:00 pgoyette Exp $ */
2
3 /*
4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 *
6 * Development of this software was supported by the
7 * Finnish Cultural Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 /*
32 * Block device emulation. Presents a block device interface and
33 * uses rumpuser system calls to satisfy I/O requests.
34 *
35 * We provide fault injection. The driver can be made to fail
36 * I/O occasionally.
37 */
38
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.64.2.2 2016/07/19 06:27:00 pgoyette Exp $");
41
42 #include <sys/param.h>
43 #include <sys/buf.h>
44 #include <sys/conf.h>
45 #include <sys/condvar.h>
46 #include <sys/disklabel.h>
47 #include <sys/evcnt.h>
48 #include <sys/fcntl.h>
49 #include <sys/kmem.h>
50 #include <sys/malloc.h>
51 #include <sys/queue.h>
52 #include <sys/stat.h>
53 #include <sys/cprng.h>
54 #include <sys/localcount.h>
55
56 #include <rump-sys/kern.h>
57 #include <rump-sys/vfs.h>
58
59 #include <rump/rumpuser.h>
60
61 #if 0
62 #define DPRINTF(x) printf x
63 #else
64 #define DPRINTF(x)
65 #endif
66
/* maximum number of block device minors; one slot per registered backend */
#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;		/* host path; NULL = slot free */
	int rblk_fd;			/* host file descriptor; -1 = closed */
	int rblk_mode;			/* FREAD or FREAD|FWRITE (host access) */

	uint64_t rblk_size;		/* size exposed to consumers, bytes */
	uint64_t rblk_hostoffset;	/* start offset within the host file */
	uint64_t rblk_hostsize;		/* total size of the host file */
	int rblk_ftype;			/* RUMPUSER_FT_* type of host file */

	struct disklabel rblk_label;	/* fabricated label for this minor */
} minors[RUMPBLK_SIZE];
80
/* I/O statistics, exported as event counters */
static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

/* device switch entry point prototypes */
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);
97
/* block device switch used when fault injection is disabled */
static const struct bdevsw rumpblk_bdevsw = {
	LOCALCOUNT_INITIALIZER
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
109
/* block device switch used when RUMP_BLKFAIL fault injection is active */
static const struct bdevsw rumpblk_bdevsw_fail = {
	LOCALCOUNT_INITIALIZER
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy_fail,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
121
/* character (raw) device switch; read/write go through physio */
static const struct cdevsw rumpblk_cdevsw = {
	LOCALCOUNT_INITIALIZER
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_read = rumpblk_read,
	.d_write = rumpblk_write,
	.d_ioctl = rumpblk_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
137
static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;			/* 0 = fault injection disabled */
static unsigned randstate;		/* private PRNG state, see gimmerand() */
static kmutex_t rumpblk_lock;		/* protects the minors[] table */
static int sectshift = DEV_BSHIFT;	/* log2 of the emulated sector size */
147
148 static void
149 makedefaultlabel(struct disklabel *lp, off_t size, int part)
150 {
151 int i;
152
153 memset(lp, 0, sizeof(*lp));
154
155 lp->d_secperunit = size;
156 lp->d_secsize = 1 << sectshift;
157 lp->d_nsectors = size >> sectshift;
158 lp->d_ntracks = 1;
159 lp->d_ncylinders = 1;
160 lp->d_secpercyl = lp->d_nsectors;
161
162 /* oh dear oh dear */
163 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
164 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
165
166 lp->d_type = DKTYPE_RUMPD;
167 lp->d_rpm = 11;
168 lp->d_interleave = 1;
169 lp->d_flags = 0;
170
171 /* XXX: RAW_PART handling? */
172 for (i = 0; i < part; i++) {
173 lp->d_partitions[i].p_fstype = FS_UNUSED;
174 }
175 lp->d_partitions[part].p_size = size >> sectshift;
176 lp->d_npartitions = part+1;
177 /* XXX: file system type? */
178
179 lp->d_magic = DISKMAGIC;
180 lp->d_magic2 = DISKMAGIC;
181 lp->d_checksum = 0; /* XXX */
182 }
183
184 int
185 rumpblk_init(void)
186 {
187 char buf[64];
188 devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
189 unsigned tmp;
190 int i;
191
192 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
193
194 if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
195 blkfail = strtoul(buf, NULL, 10);
196 /* fail everything */
197 if (blkfail > BLKFAIL_MAX)
198 blkfail = BLKFAIL_MAX;
199 if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
200 buf, sizeof(buf)) == 0) {
201 randstate = strtoul(buf, NULL, 10);
202 } else {
203 randstate = cprng_fast32();
204 }
205 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
206 "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
207 } else {
208 blkfail = 0;
209 }
210
211 if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
212 printf("rumpblk: ");
213 tmp = strtoul(buf, NULL, 10);
214 if (tmp >= DEV_BSHIFT)
215 sectshift = tmp;
216 else
217 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
218 DEV_BSHIFT, tmp);
219 printf("using %d for sector shift (size %d)\n",
220 sectshift, 1<<sectshift);
221 }
222
223 memset(minors, 0, sizeof(minors));
224 for (i = 0; i < RUMPBLK_SIZE; i++) {
225 minors[i].rblk_fd = -1;
226 }
227
228 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
229 "rumpblk", "I/O reqs");
230 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
231 "rumpblk", "async I/O");
232
233 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
234 "rumpblk", "bytes read");
235 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
236 "rumpblk", "bytes written");
237 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
238 "rumpblk", "bytes written async");
239
240 if (blkfail) {
241 return devsw_attach("rumpblk",
242 &rumpblk_bdevsw_fail, &rumpblkmaj,
243 &rumpblk_cdevsw, &rumpblkmaj);
244 } else {
245 return devsw_attach("rumpblk",
246 &rumpblk_bdevsw, &rumpblkmaj,
247 &rumpblk_cdevsw, &rumpblkmaj);
248 }
249 }
250
/*
 * Register a host file or device as a rump block device backend.
 * "offset"/"size" select a window into the host file; size may be
 * RUMPBLK_SIZENOTSET to mean "from offset to end of file".  On
 * success the allocated minor number is returned via "dmin".
 * Registering an already-registered path returns the existing minor.
 */
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if ((error = rumpuser_getfileinfo(path, &flen, &ftype)) != 0)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	/* already registered?  then hand back the existing minor */
	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	/* find a free slot */
	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	/*
	 * Reserve the slot with a placeholder path before dropping the
	 * lock, so a concurrent register cannot claim it while we set
	 * it up below without holding the lock.
	 */
	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		/* explicit size: the window must fit in the host file */
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		/* implicit size: everything from offset to EOF */
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	if ((error = backend_open(rblk, path)) != 0) {
		/* roll back; clearing rblk_path releases the slot */
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}
316
317 /*
318 * Unregister rumpblk. It's the callers responsibility to make
319 * sure it's no longer in use.
320 */
321 int
322 rumpblk_deregister(const char *path)
323 {
324 struct rblkdev *rblk;
325 int i;
326
327 mutex_enter(&rumpblk_lock);
328 for (i = 0; i < RUMPBLK_SIZE; i++) {
329 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
330 break;
331 }
332 }
333 mutex_exit(&rumpblk_lock);
334
335 if (i == RUMPBLK_SIZE)
336 return ENOENT;
337
338 rblk = &minors[i];
339 backend_close(rblk);
340
341 free(rblk->rblk_path, M_TEMP);
342 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
343 rblk->rblk_path = NULL;
344
345 return 0;
346 }
347
348 /*
349 * Release all backend resources, to be called only when the rump
350 * kernel is being shut down.
351 * This routine does not do a full "fini" since we're going down anyway.
352 */
353 void
354 rumpblk_fini(void)
355 {
356 int i;
357
358 for (i = 0; i < RUMPBLK_SIZE; i++) {
359 struct rblkdev *rblk;
360
361 rblk = &minors[i];
362 if (rblk->rblk_fd != -1)
363 backend_close(rblk);
364 }
365 }
366
367 static int
368 backend_open(struct rblkdev *rblk, const char *path)
369 {
370 int error, fd;
371
372 KASSERT(rblk->rblk_fd == -1);
373 error = rumpuser_open(path,
374 RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
375 if (error) {
376 error = rumpuser_open(path,
377 RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
378 if (error)
379 return error;
380 rblk->rblk_mode = FREAD;
381 } else {
382 rblk->rblk_mode = FREAD|FWRITE;
383 }
384
385 rblk->rblk_fd = fd;
386 KASSERT(rblk->rblk_fd != -1);
387 return 0;
388 }
389
390 static int
391 backend_close(struct rblkdev *rblk)
392 {
393
394 rumpuser_close(rblk->rblk_fd);
395 rblk->rblk_fd = -1;
396
397 return 0;
398 }
399
400 int
401 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
402 {
403 struct rblkdev *rblk = &minors[minor(dev)];
404
405 if (rblk->rblk_fd == -1)
406 return ENXIO;
407
408 if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
409 return EACCES;
410 }
411
412 return 0;
413 }
414
415 int
416 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
417 {
418
419 return 0;
420 }
421
422 int
423 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
424 {
425 devminor_t dmin = minor(dev);
426 struct rblkdev *rblk = &minors[dmin];
427 struct partinfo *pi;
428 struct partition *dp;
429 int error = 0;
430
431 /* well, me should support a few more, but we don't for now */
432 switch (xfer) {
433 case DIOCGDINFO:
434 *(struct disklabel *)addr = rblk->rblk_label;
435 break;
436
437 case DIOCGPARTINFO:
438 dp = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
439 pi = addr;
440 pi->pi_offset = dp->p_offset;
441 pi->pi_size = dp->p_size;
442 pi->pi_secsize = rblk->rblk_label.d_secsize;
443 pi->pi_bsize = BLKDEV_IOSIZE;
444 pi->pi_fstype = dp->p_fstype;
445 pi->pi_fsize = dp->p_fsize;
446 pi->pi_frag = dp->p_frag;
447 pi->pi_cpg = dp->p_cpg;
448 break;
449
450 /* it's synced enough along the write path */
451 case DIOCCACHESYNC:
452 break;
453
454 case DIOCGMEDIASIZE:
455 *(off_t *)addr = (off_t)rblk->rblk_size;
456 break;
457
458 default:
459 error = ENOTTY;
460 break;
461 }
462
463 return error;
464 }
465
466 static int
467 do_physio(dev_t dev, struct uio *uio, int which)
468 {
469 void (*strat)(struct buf *);
470
471 if (blkfail)
472 strat = rumpblk_strategy_fail;
473 else
474 strat = rumpblk_strategy;
475
476 return physio(strat, NULL, dev, which, minphys, uio);
477 }
478
479 int
480 rumpblk_read(dev_t dev, struct uio *uio, int flags)
481 {
482
483 return do_physio(dev, uio, B_READ);
484 }
485
486 int
487 rumpblk_write(dev_t dev, struct uio *uio, int flags)
488 {
489
490 return do_physio(dev, uio, B_WRITE);
491 }
492
/*
 * Common strategy routine: translate the buf into a host I/O
 * request and submit it via rumpuser_bio().  Completion is
 * delivered asynchronously through rump_biodone().
 */
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int op;

	/* requests must be a multiple of the emulated sector size */
	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	/* translate the device-relative offset into a host file offset */
	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* non-async writes are flushed to backing storage synchronously */
	op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
	if (BUF_ISWRITE(bp) && !async)
		op |= RUMPUSER_BIO_SYNC;

	rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
	    rump_biodone, bp);
}
562
/* Normal (non-faulting) strategy entry point. */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
569
570 /*
571 * Simple random number generator. This is private so that we can
572 * very repeatedly control which blocks will fail.
573 *
574 * <mlelstv> pooka, rand()
575 * <mlelstv> [paste]
576 */
577 static unsigned
578 gimmerand(void)
579 {
580
581 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
582 }
583
584 /*
585 * Block device with very simple fault injection. Fails every
586 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
587 * variable RUMP_BLKFAIL.
588 */
589 void
590 rumpblk_strategy_fail(struct buf *bp)
591 {
592
593 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
594 dostrategy(bp);
595 } else {
596 printf("block fault injection: failing I/O on block %lld\n",
597 (long long)bp->b_blkno);
598 bp->b_error = EIO;
599 biodone(bp);
600 }
601 }
602