/*	$NetBSD: rumpblk.c,v 1.50 2013/04/29 13:07:37 pooka Exp $	*/
2
3 /*
4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 *
6 * Development of this software was supported by the
7 * Finnish Cultural Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 /*
32 * Block device emulation. Presents a block device interface and
33 * uses rumpuser system calls to satisfy I/O requests.
34 *
35 * We provide fault injection. The driver can be made to fail
36 * I/O occasionally.
37 */
38
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.50 2013/04/29 13:07:37 pooka Exp $");
41
42 #include <sys/param.h>
43 #include <sys/buf.h>
44 #include <sys/conf.h>
45 #include <sys/condvar.h>
46 #include <sys/disklabel.h>
47 #include <sys/evcnt.h>
48 #include <sys/fcntl.h>
49 #include <sys/kmem.h>
50 #include <sys/malloc.h>
51 #include <sys/queue.h>
52 #include <sys/stat.h>
53 #include <sys/cprng.h>
54
55 #include <rump/rumpuser.h>
56
57 #include "rump_private.h"
58 #include "rump_vfs_private.h"
59
/*
 * Debug tracing hook.  Compiled out by default: with the condition
 * below left at 0, DPRINTF((fmt, ...)) expands to nothing.  Change it
 * to a non-zero value to route the arguments to printf.
 */
#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif
65
66 #define RUMPBLK_SIZE 16
67 static struct rblkdev {
68 char *rblk_path;
69 int rblk_fd;
70 int rblk_mode;
71
72 uint64_t rblk_size;
73 uint64_t rblk_hostoffset;
74 uint64_t rblk_hostsize;
75 int rblk_ftype;
76
77 struct disklabel rblk_label;
78 } minors[RUMPBLK_SIZE];
79
80 static struct evcnt ev_io_total;
81 static struct evcnt ev_io_async;
82
83 static struct evcnt ev_bwrite_total;
84 static struct evcnt ev_bwrite_async;
85 static struct evcnt ev_bread_total;
86
/*
 * Device entry point declarations.
 *
 * NOTE(review): rumpblk_dump and rumpblk_size are declared here, but
 * the switch tables in this file use nodump/nosize instead.
 */
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);
96
97 static const struct bdevsw rumpblk_bdevsw = {
98 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
99 nodump, nosize, D_DISK
100 };
101
102 static const struct bdevsw rumpblk_bdevsw_fail = {
103 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
104 nodump, nosize, D_DISK
105 };
106
107 static const struct cdevsw rumpblk_cdevsw = {
108 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
109 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
110 };
111
/* Host file open/close helpers, defined further down in this file. */
static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);
114
115 /* fail every n out of BLKFAIL_MAX */
116 #define BLKFAIL_MAX 10000
117 static int blkfail;
118 static unsigned randstate;
119 static kmutex_t rumpblk_lock;
120 static int sectshift = DEV_BSHIFT;
121
122 static void
123 makedefaultlabel(struct disklabel *lp, off_t size, int part)
124 {
125 int i;
126
127 memset(lp, 0, sizeof(*lp));
128
129 lp->d_secperunit = size;
130 lp->d_secsize = 1 << sectshift;
131 lp->d_nsectors = size >> sectshift;
132 lp->d_ntracks = 1;
133 lp->d_ncylinders = 1;
134 lp->d_secpercyl = lp->d_nsectors;
135
136 /* oh dear oh dear */
137 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
138 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
139
140 lp->d_type = DTYPE_RUMPD;
141 lp->d_rpm = 11;
142 lp->d_interleave = 1;
143 lp->d_flags = 0;
144
145 /* XXX: RAW_PART handling? */
146 for (i = 0; i < part; i++) {
147 lp->d_partitions[i].p_fstype = FS_UNUSED;
148 }
149 lp->d_partitions[part].p_size = size >> sectshift;
150 lp->d_npartitions = part+1;
151 /* XXX: file system type? */
152
153 lp->d_magic = DISKMAGIC;
154 lp->d_magic2 = DISKMAGIC;
155 lp->d_checksum = 0; /* XXX */
156 }
157
158 int
159 rumpblk_init(void)
160 {
161 char buf[64];
162 devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
163 unsigned tmp;
164 int error, i;
165
166 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
167
168 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
169 blkfail = strtoul(buf, NULL, 10);
170 /* fail everything */
171 if (blkfail > BLKFAIL_MAX)
172 blkfail = BLKFAIL_MAX;
173 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
174 &error) == 0) {
175 randstate = strtoul(buf, NULL, 10);
176 } else {
177 randstate = cprng_fast32();
178 }
179 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
180 "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
181 } else {
182 blkfail = 0;
183 }
184
185 if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error)==0){
186 printf("rumpblk: ");
187 tmp = strtoul(buf, NULL, 10);
188 if (tmp >= DEV_BSHIFT)
189 sectshift = tmp;
190 else
191 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
192 DEV_BSHIFT, tmp);
193 printf("using %d for sector shift (size %d)\n",
194 sectshift, 1<<sectshift);
195 }
196
197 memset(minors, 0, sizeof(minors));
198 for (i = 0; i < RUMPBLK_SIZE; i++) {
199 minors[i].rblk_fd = -1;
200 }
201
202 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
203 "rumpblk", "I/O reqs");
204 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
205 "rumpblk", "async I/O");
206
207 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
208 "rumpblk", "bytes read");
209 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
210 "rumpblk", "bytes written");
211 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
212 "rumpblk", "bytes written async");
213
214 if (blkfail) {
215 return devsw_attach("rumpblk",
216 &rumpblk_bdevsw_fail, &rumpblkmaj,
217 &rumpblk_cdevsw, &rumpblkmaj);
218 } else {
219 return devsw_attach("rumpblk",
220 &rumpblk_bdevsw, &rumpblkmaj,
221 &rumpblk_cdevsw, &rumpblkmaj);
222 }
223 }
224
225 int
226 rumpblk_register(const char *path, devminor_t *dmin,
227 uint64_t offset, uint64_t size)
228 {
229 struct rblkdev *rblk;
230 uint64_t flen;
231 size_t len;
232 int ftype, error, i;
233
234 /* devices might not report correct size unless they're open */
235 if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
236 return error;
237
238 /* verify host file is of supported type */
239 if (!(ftype == RUMPUSER_FT_REG
240 || ftype == RUMPUSER_FT_BLK
241 || ftype == RUMPUSER_FT_CHR))
242 return EINVAL;
243
244 mutex_enter(&rumpblk_lock);
245 for (i = 0; i < RUMPBLK_SIZE; i++) {
246 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
247 mutex_exit(&rumpblk_lock);
248 *dmin = i;
249 return 0;
250 }
251 }
252
253 for (i = 0; i < RUMPBLK_SIZE; i++)
254 if (minors[i].rblk_path == NULL)
255 break;
256 if (i == RUMPBLK_SIZE) {
257 mutex_exit(&rumpblk_lock);
258 return EBUSY;
259 }
260
261 rblk = &minors[i];
262 rblk->rblk_path = __UNCONST("taken");
263 mutex_exit(&rumpblk_lock);
264
265 len = strlen(path);
266 rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
267 strcpy(rblk->rblk_path, path);
268 rblk->rblk_hostoffset = offset;
269 if (size != RUMPBLK_SIZENOTSET) {
270 KASSERT(size + offset <= flen);
271 rblk->rblk_size = size;
272 } else {
273 KASSERT(offset < flen);
274 rblk->rblk_size = flen - offset;
275 }
276 rblk->rblk_hostsize = flen;
277 rblk->rblk_ftype = ftype;
278 makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
279
280 if ((error = backend_open(rblk, path)) != 0) {
281 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
282 free(rblk->rblk_path, M_TEMP);
283 rblk->rblk_path = NULL;
284 return error;
285 }
286
287 *dmin = i;
288 return 0;
289 }
290
291 /*
292 * Unregister rumpblk. It's the callers responsibility to make
293 * sure it's no longer in use.
294 */
295 int
296 rumpblk_deregister(const char *path)
297 {
298 struct rblkdev *rblk;
299 int i;
300
301 mutex_enter(&rumpblk_lock);
302 for (i = 0; i < RUMPBLK_SIZE; i++) {
303 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
304 break;
305 }
306 }
307 mutex_exit(&rumpblk_lock);
308
309 if (i == RUMPBLK_SIZE)
310 return ENOENT;
311
312 rblk = &minors[i];
313 backend_close(rblk);
314
315 free(rblk->rblk_path, M_TEMP);
316 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
317 rblk->rblk_path = NULL;
318
319 return 0;
320 }
321
322 static int
323 backend_open(struct rblkdev *rblk, const char *path)
324 {
325 int error, fd;
326
327 KASSERT(rblk->rblk_fd == -1);
328 fd = rumpuser_open(path,
329 RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &error);
330 if (error) {
331 fd = rumpuser_open(path,
332 RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &error);
333 if (error)
334 return error;
335 rblk->rblk_mode = FREAD;
336 } else {
337 rblk->rblk_mode = FREAD|FWRITE;
338 }
339
340 rblk->rblk_fd = fd;
341 KASSERT(rblk->rblk_fd != -1);
342 return 0;
343 }
344
345 static int
346 backend_close(struct rblkdev *rblk)
347 {
348 int dummy;
349
350 rumpuser_fsync(rblk->rblk_fd, &dummy);
351 rumpuser_close(rblk->rblk_fd, &dummy);
352 rblk->rblk_fd = -1;
353
354 return 0;
355 }
356
357 int
358 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
359 {
360 struct rblkdev *rblk = &minors[minor(dev)];
361
362 if (rblk->rblk_fd == -1)
363 return ENXIO;
364
365 if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
366 return EACCES;
367 }
368
369 return 0;
370 }
371
372 int
373 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
374 {
375
376 return 0;
377 }
378
379 int
380 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
381 {
382 devminor_t dmin = minor(dev);
383 struct rblkdev *rblk = &minors[dmin];
384 struct partinfo *pi;
385 int error = 0;
386
387 /* well, me should support a few more, but we don't for now */
388 switch (xfer) {
389 case DIOCGDINFO:
390 *(struct disklabel *)addr = rblk->rblk_label;
391 break;
392
393 case DIOCGPART:
394 pi = addr;
395 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
396 pi->disklab = &rblk->rblk_label;
397 break;
398
399 /* it's synced enough along the write path */
400 case DIOCCACHESYNC:
401 break;
402
403 default:
404 error = ENOTTY;
405 break;
406 }
407
408 return error;
409 }
410
411 static int
412 do_physio(dev_t dev, struct uio *uio, int which)
413 {
414 void (*strat)(struct buf *);
415
416 if (blkfail)
417 strat = rumpblk_strategy_fail;
418 else
419 strat = rumpblk_strategy;
420
421 return physio(strat, NULL, dev, which, minphys, uio);
422 }
423
424 int
425 rumpblk_read(dev_t dev, struct uio *uio, int flags)
426 {
427
428 return do_physio(dev, uio, B_READ);
429 }
430
431 int
432 rumpblk_write(dev_t dev, struct uio *uio, int flags)
433 {
434
435 return do_physio(dev, uio, B_WRITE);
436 }
437
438 static void
439 dostrategy(struct buf *bp)
440 {
441 struct rblkdev *rblk = &minors[minor(bp->b_dev)];
442 off_t off;
443 int async = bp->b_flags & B_ASYNC;
444 int op;
445
446 if (bp->b_bcount % (1<<sectshift) != 0) {
447 rump_biodone(bp, 0, EINVAL);
448 return;
449 }
450
451 /* collect statistics */
452 ev_io_total.ev_count++;
453 if (async)
454 ev_io_async.ev_count++;
455 if (BUF_ISWRITE(bp)) {
456 ev_bwrite_total.ev_count += bp->b_bcount;
457 if (async)
458 ev_bwrite_async.ev_count += bp->b_bcount;
459 } else {
460 ev_bread_total.ev_count++;
461 }
462
463 /*
464 * b_blkno is always in terms of DEV_BSIZE, and since we need
465 * to translate to a byte offset for the host read, this
466 * calculation does not need sectshift.
467 */
468 off = bp->b_blkno << DEV_BSHIFT;
469
470 /*
471 * Do bounds checking if we're working on a file. Otherwise
472 * invalid file systems might attempt to read beyond EOF. This
473 * is bad(tm) especially on mmapped images. This is essentially
474 * the kernel bounds_check() routines.
475 */
476 if (off + bp->b_bcount > rblk->rblk_size) {
477 int64_t sz = rblk->rblk_size - off;
478
479 /* EOF */
480 if (sz == 0) {
481 rump_biodone(bp, 0, 0);
482 return;
483 }
484 /* beyond EOF ==> error */
485 if (sz < 0) {
486 rump_biodone(bp, 0, EINVAL);
487 return;
488 }
489
490 /* truncate to device size */
491 bp->b_bcount = sz;
492 }
493
494 off += rblk->rblk_hostoffset;
495 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
496 " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
497 bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
498 off, off, (off + bp->b_bcount), async ? "a" : ""));
499
500 op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
501 if (BUF_ISWRITE(bp) && !async)
502 op |= RUMPUSER_BIO_SYNC;
503
504 rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
505 rump_biodone, bp);
506 }
507
/* Strategy entry point without fault injection; defers to dostrategy(). */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
514
515 /*
516 * Simple random number generator. This is private so that we can
517 * very repeatedly control which blocks will fail.
518 *
519 * <mlelstv> pooka, rand()
520 * <mlelstv> [paste]
521 */
522 static unsigned
523 gimmerand(void)
524 {
525
526 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
527 }
528
529 /*
530 * Block device with very simple fault injection. Fails every
531 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
532 * variable RUMP_BLKFAIL.
533 */
534 void
535 rumpblk_strategy_fail(struct buf *bp)
536 {
537
538 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
539 dostrategy(bp);
540 } else {
541 printf("block fault injection: failing I/O on block %lld\n",
542 (long long)bp->b_blkno);
543 bp->b_error = EIO;
544 biodone(bp);
545 }
546 }
547