rumpblk.c revision 1.64.8.2 1 /* $NetBSD: rumpblk.c,v 1.64.8.2 2017/04/29 11:12:15 pgoyette Exp $ */
2
3 /*
4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 *
6 * Development of this software was supported by the
7 * Finnish Cultural Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 /*
32 * Block device emulation. Presents a block device interface and
33 * uses rumpuser system calls to satisfy I/O requests.
34 *
35 * We provide fault injection. The driver can be made to fail
36 * I/O occasionally.
37 */
38
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.64.8.2 2017/04/29 11:12:15 pgoyette Exp $");
41
42 #include <sys/param.h>
43 #include <sys/buf.h>
44 #include <sys/conf.h>
45 #include <sys/condvar.h>
46 #include <sys/disklabel.h>
47 #include <sys/evcnt.h>
48 #include <sys/fcntl.h>
49 #include <sys/kmem.h>
50 #include <sys/malloc.h>
51 #include <sys/queue.h>
52 #include <sys/stat.h>
53 #include <sys/cprng.h>
54
55 #include <rump-sys/kern.h>
56 #include <rump-sys/vfs.h>
57
58 #include <rump/rumpuser.h>
59
/* Compile-time debug tracing: flip the "#if 0" to "#if 1" to enable DPRINTF. */
#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif
65
/* Number of available minor devices; the registry below is a fixed array. */
#define RUMPBLK_SIZE 16

/*
 * Per-minor state.  A slot is free iff rblk_path == NULL; the array is
 * protected by rumpblk_lock (see rumpblk_register/rumpblk_deregister).
 */
static struct rblkdev {
	char *rblk_path;	/* host path; NULL marks a free slot */
	int rblk_fd;		/* host file descriptor, -1 when closed */
	int rblk_mode;		/* FREAD|FWRITE actually granted by the host */

	uint64_t rblk_size;	/* size exposed to the rump kernel (bytes) */
	uint64_t rblk_hostoffset;	/* start offset within the host file */
	uint64_t rblk_hostsize;	/* total host file size (bytes) */
	int rblk_ftype;		/* RUMPUSER_FT_* type of the host file */

	struct disklabel rblk_label;	/* fabricated label, makedefaultlabel() */
} minors[RUMPBLK_SIZE];
79
/* I/O statistics, attached in rumpblk_init() and visible via vmstat -e. */
static struct evcnt ev_io_total;	/* total I/O requests */
static struct evcnt ev_io_async;	/* async I/O requests */

static struct evcnt ev_bwrite_total;	/* bytes written */
static struct evcnt ev_bwrite_async;	/* bytes written async */
static struct evcnt ev_bread_total;	/* bytes read */

/* devsw entry point prototypes (dev_type_* expand to standard signatures) */
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);
96
/* Normal block device switch: plain pass-through strategy. */
static const struct bdevsw rumpblk_bdevsw = {
	DEVSW_MODULE_INIT
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
108
/*
 * Fault-injecting block device switch; selected at init time when
 * RUMP_BLKFAIL is set.  Differs from rumpblk_bdevsw only in d_strategy.
 */
static const struct bdevsw rumpblk_bdevsw_fail = {
	DEVSW_MODULE_INIT
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy_fail,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
120
/* Character (raw) device switch; read/write go through physio. */
static const struct cdevsw rumpblk_cdevsw = {
	DEVSW_MODULE_INIT
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_read = rumpblk_read,
	.d_write = rumpblk_write,
	.d_ioctl = rumpblk_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
136
static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;		/* failure rate from RUMP_BLKFAIL; 0 = disabled */
static unsigned randstate;	/* private LCG state used by gimmerand() */
static kmutex_t rumpblk_lock;	/* serializes access to the minors[] registry */
static int sectshift = DEV_BSHIFT;	/* log2 of the emulated sector size */
146
/*
 * Fabricate a minimal disklabel for a backend of "size" bytes, placing
 * the whole device in partition "part" (earlier partitions are marked
 * FS_UNUSED).  The geometry is fictitious: one track, one cylinder.
 */
static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	/*
	 * NOTE(review): d_secperunit is set to the byte size while
	 * d_nsectors/p_size are in sectors (size >> sectshift) — confirm
	 * this asymmetry is intentional for consumers of this label.
	 */
	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DKTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}
182
183 int
184 rumpblk_init(void)
185 {
186 char buf[64];
187 devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
188 unsigned tmp;
189 int i;
190
191 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
192
193 if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
194 blkfail = strtoul(buf, NULL, 10);
195 /* fail everything */
196 if (blkfail > BLKFAIL_MAX)
197 blkfail = BLKFAIL_MAX;
198 if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
199 buf, sizeof(buf)) == 0) {
200 randstate = strtoul(buf, NULL, 10);
201 } else {
202 randstate = cprng_fast32();
203 }
204 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
205 "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
206 } else {
207 blkfail = 0;
208 }
209
210 if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
211 printf("rumpblk: ");
212 tmp = strtoul(buf, NULL, 10);
213 if (tmp >= DEV_BSHIFT)
214 sectshift = tmp;
215 else
216 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
217 DEV_BSHIFT, tmp);
218 printf("using %d for sector shift (size %d)\n",
219 sectshift, 1<<sectshift);
220 }
221
222 memset(minors, 0, sizeof(minors));
223 for (i = 0; i < RUMPBLK_SIZE; i++) {
224 minors[i].rblk_fd = -1;
225 }
226
227 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
228 "rumpblk", "I/O reqs");
229 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
230 "rumpblk", "async I/O");
231
232 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
233 "rumpblk", "bytes read");
234 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
235 "rumpblk", "bytes written");
236 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
237 "rumpblk", "bytes written async");
238
239 if (blkfail) {
240 return devsw_attach("rumpblk",
241 &rumpblk_bdevsw_fail, &rumpblkmaj,
242 &rumpblk_cdevsw, &rumpblkmaj);
243 } else {
244 return devsw_attach("rumpblk",
245 &rumpblk_bdevsw, &rumpblkmaj,
246 &rumpblk_cdevsw, &rumpblkmaj);
247 }
248 }
249
/*
 * Register a host file or device as a rump block device.  On success the
 * assigned minor number is stored in *dmin and 0 is returned.  If the path
 * is already registered, the existing minor is returned (idempotent).
 *
 * "offset" is the starting offset within the host file; "size" is the
 * device size in bytes or RUMPBLK_SIZENOTSET to use the remainder of the
 * host file.  Returns an errno on failure.
 */
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if ((error = rumpuser_getfileinfo(path, &flen, &ftype)) != 0)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	/* already registered?  return the existing minor */
	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	/* find a free slot */
	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	/*
	 * Reserve the slot with a placeholder before dropping the lock so
	 * concurrent registrations cannot claim it; the real path string
	 * is filled in below, outside the lock.
	 */
	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		/* explicit size: window must fit inside the host file */
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		/* use everything from offset to end of host file */
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	/* open the host backend; on failure release the slot */
	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}
315
316 /*
317 * Unregister rumpblk. It's the callers responsibility to make
318 * sure it's no longer in use.
319 */
320 int
321 rumpblk_deregister(const char *path)
322 {
323 struct rblkdev *rblk;
324 int i;
325
326 mutex_enter(&rumpblk_lock);
327 for (i = 0; i < RUMPBLK_SIZE; i++) {
328 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
329 break;
330 }
331 }
332 mutex_exit(&rumpblk_lock);
333
334 if (i == RUMPBLK_SIZE)
335 return ENOENT;
336
337 rblk = &minors[i];
338 backend_close(rblk);
339
340 free(rblk->rblk_path, M_TEMP);
341 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
342 rblk->rblk_path = NULL;
343
344 return 0;
345 }
346
347 /*
348 * Release all backend resources, to be called only when the rump
349 * kernel is being shut down.
350 * This routine does not do a full "fini" since we're going down anyway.
351 */
352 void
353 rumpblk_fini(void)
354 {
355 int i;
356
357 for (i = 0; i < RUMPBLK_SIZE; i++) {
358 struct rblkdev *rblk;
359
360 rblk = &minors[i];
361 if (rblk->rblk_fd != -1)
362 backend_close(rblk);
363 }
364 }
365
366 static int
367 backend_open(struct rblkdev *rblk, const char *path)
368 {
369 int error, fd;
370
371 KASSERT(rblk->rblk_fd == -1);
372 error = rumpuser_open(path,
373 RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
374 if (error) {
375 error = rumpuser_open(path,
376 RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
377 if (error)
378 return error;
379 rblk->rblk_mode = FREAD;
380 } else {
381 rblk->rblk_mode = FREAD|FWRITE;
382 }
383
384 rblk->rblk_fd = fd;
385 KASSERT(rblk->rblk_fd != -1);
386 return 0;
387 }
388
389 static int
390 backend_close(struct rblkdev *rblk)
391 {
392
393 rumpuser_close(rblk->rblk_fd);
394 rblk->rblk_fd = -1;
395
396 return 0;
397 }
398
399 int
400 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
401 {
402 struct rblkdev *rblk = &minors[minor(dev)];
403
404 if (rblk->rblk_fd == -1)
405 return ENXIO;
406
407 if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
408 return EACCES;
409 }
410
411 return 0;
412 }
413
/*
 * Device close: nothing to do, the host backend stays open until
 * deregistration.  Always succeeds.
 */
int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}
420
421 int
422 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
423 {
424 devminor_t dmin = minor(dev);
425 struct rblkdev *rblk = &minors[dmin];
426 struct partinfo *pi;
427 struct partition *dp;
428 int error = 0;
429
430 /* well, me should support a few more, but we don't for now */
431 switch (xfer) {
432 case DIOCGDINFO:
433 *(struct disklabel *)addr = rblk->rblk_label;
434 break;
435
436 case DIOCGPARTINFO:
437 dp = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
438 pi = addr;
439 pi->pi_offset = dp->p_offset;
440 pi->pi_size = dp->p_size;
441 pi->pi_secsize = rblk->rblk_label.d_secsize;
442 pi->pi_bsize = BLKDEV_IOSIZE;
443 pi->pi_fstype = dp->p_fstype;
444 pi->pi_fsize = dp->p_fsize;
445 pi->pi_frag = dp->p_frag;
446 pi->pi_cpg = dp->p_cpg;
447 break;
448
449 /* it's synced enough along the write path */
450 case DIOCCACHESYNC:
451 break;
452
453 case DIOCGMEDIASIZE:
454 *(off_t *)addr = (off_t)rblk->rblk_size;
455 break;
456
457 default:
458 error = ENOTTY;
459 break;
460 }
461
462 return error;
463 }
464
465 static int
466 do_physio(dev_t dev, struct uio *uio, int which)
467 {
468 void (*strat)(struct buf *);
469
470 if (blkfail)
471 strat = rumpblk_strategy_fail;
472 else
473 strat = rumpblk_strategy;
474
475 return physio(strat, NULL, dev, which, minphys, uio);
476 }
477
478 int
479 rumpblk_read(dev_t dev, struct uio *uio, int flags)
480 {
481
482 return do_physio(dev, uio, B_READ);
483 }
484
485 int
486 rumpblk_write(dev_t dev, struct uio *uio, int flags)
487 {
488
489 return do_physio(dev, uio, B_WRITE);
490 }
491
492 static void
493 dostrategy(struct buf *bp)
494 {
495 struct rblkdev *rblk = &minors[minor(bp->b_dev)];
496 off_t off;
497 int async = bp->b_flags & B_ASYNC;
498 int op;
499
500 if (bp->b_bcount % (1<<sectshift) != 0) {
501 rump_biodone(bp, 0, EINVAL);
502 return;
503 }
504
505 /* collect statistics */
506 ev_io_total.ev_count++;
507 if (async)
508 ev_io_async.ev_count++;
509 if (BUF_ISWRITE(bp)) {
510 ev_bwrite_total.ev_count += bp->b_bcount;
511 if (async)
512 ev_bwrite_async.ev_count += bp->b_bcount;
513 } else {
514 ev_bread_total.ev_count++;
515 }
516
517 /*
518 * b_blkno is always in terms of DEV_BSIZE, and since we need
519 * to translate to a byte offset for the host read, this
520 * calculation does not need sectshift.
521 */
522 off = bp->b_blkno << DEV_BSHIFT;
523
524 /*
525 * Do bounds checking if we're working on a file. Otherwise
526 * invalid file systems might attempt to read beyond EOF. This
527 * is bad(tm) especially on mmapped images. This is essentially
528 * the kernel bounds_check() routines.
529 */
530 if (off + bp->b_bcount > rblk->rblk_size) {
531 int64_t sz = rblk->rblk_size - off;
532
533 /* EOF */
534 if (sz == 0) {
535 rump_biodone(bp, 0, 0);
536 return;
537 }
538 /* beyond EOF ==> error */
539 if (sz < 0) {
540 rump_biodone(bp, 0, EINVAL);
541 return;
542 }
543
544 /* truncate to device size */
545 bp->b_bcount = sz;
546 }
547
548 off += rblk->rblk_hostoffset;
549 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
550 " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
551 bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
552 off, off, (off + bp->b_bcount), async ? "a" : ""));
553
554 op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
555 if (BUF_ISWRITE(bp) && !async)
556 op |= RUMPUSER_BIO_SYNC;
557
558 rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
559 rump_biodone, bp);
560 }
561
/* Normal (non-fault-injecting) strategy entry point. */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
568
569 /*
570 * Simple random number generator. This is private so that we can
571 * very repeatedly control which blocks will fail.
572 *
573 * <mlelstv> pooka, rand()
574 * <mlelstv> [paste]
575 */
576 static unsigned
577 gimmerand(void)
578 {
579
580 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
581 }
582
583 /*
584 * Block device with very simple fault injection. Fails every
585 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
586 * variable RUMP_BLKFAIL.
587 */
588 void
589 rumpblk_strategy_fail(struct buf *bp)
590 {
591
592 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
593 dostrategy(bp);
594 } else {
595 printf("block fault injection: failing I/O on block %lld\n",
596 (long long)bp->b_blkno);
597 bp->b_error = EIO;
598 biodone(bp);
599 }
600 }
601