/*	$NetBSD: rumpblk.c,v 1.49 2013/04/29 12:56:03 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection.  The driver can be made to fail
 * I/O occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids kernel access for every
 * I/O operation.  It also gives finer-grained control of how to
 * flush data.  Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 *
 * However, it is quite costly in writing large amounts of
 * file data, since old contents cannot merely be overwritten, but
 * must be paged in first before replacing (i.e. r/m/w).  Ideally,
 * we should use directio.  The problem is that directio can fail
 * silently causing improper file system semantics (i.e. unflushed
 * data).  Therefore, default to mmap for now.  Even so, directio
 * _should_ be safe and can be enabled by compiling this module
 * with -DHAS_DIRECTIO.
 */
53
54 #include <sys/cdefs.h>
55 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.49 2013/04/29 12:56:03 pooka Exp $");
56
57 #include <sys/param.h>
58 #include <sys/buf.h>
59 #include <sys/conf.h>
60 #include <sys/condvar.h>
61 #include <sys/disklabel.h>
62 #include <sys/evcnt.h>
63 #include <sys/fcntl.h>
64 #include <sys/kmem.h>
65 #include <sys/malloc.h>
66 #include <sys/queue.h>
67 #include <sys/stat.h>
68 #include <sys/cprng.h>
69
70 #include <rump/rumpuser.h>
71
72 #include "rump_private.h"
73 #include "rump_vfs_private.h"
74
75 /*
76 * O_DIRECT is the fastest alternative, but since it falls back to
77 * non-direct writes silently, I am not sure it will always be 100% safe.
78 * Use it and play with it, but do that with caution.
79 */
80 #if 0
81 #define HAS_ODIRECT
82 #endif
83
84 #if 0
85 #define DPRINTF(x) printf x
86 #else
87 #define DPRINTF(x)
88 #endif
89
/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);	/* bytes per mmap window, power of two */
unsigned memwincnt = 16;	/* number of windows kept per device */

/* Round an offset down to the start of its window (memwinsize is 2^n). */
#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
/* Does the mapping held by "win" cover offset "off"? */
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
/* Bytes mapped by "win": a full window, or less at end of host file. */
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				      memwinsize))
/* A window with offset -1 carries no mapping. */
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)

/* One memory-mapped view into the backing host file. */
struct blkwin {
	off_t win_off;			/* file offset this window maps */
	void *win_mem;			/* host address of the mapping */
	int win_refcnt;			/* in-flight i/o using the window */

	TAILQ_ENTRY(blkwin) win_lru;	/* position on per-device LRU queue */
};
107
#define RUMPBLK_SIZE 16		/* maximum number of minor devices */
/* Per-minor state for one emulated block device. */
static struct rblkdev {
	char *rblk_path;		/* host path; NULL means slot free */
	int rblk_fd;			/* host fd for read/write i/o */
	int rblk_mode;			/* FREAD|FWRITE granted by the host */
#ifdef HAS_ODIRECT
	int rblk_dfd;			/* host fd opened with O_DIRECT */
#endif
	uint64_t rblk_size;		/* size exposed to the rump kernel */
	uint64_t rblk_hostoffset;	/* start offset within the host file */
	uint64_t rblk_hostsize;		/* total size of the host file */
	int rblk_ftype;			/* RUMPUSER_FT_* of the host file */

	/* for mmap */
	int rblk_mmflags;		/* mmap flags; 0 => mmap path disabled */
	kmutex_t rblk_memmtx;		/* protects LRU queue and refcounts */
	kcondvar_t rblk_memcv;		/* waited on when all windows busy */
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;	/* window LRU, head = newest */
	bool rblk_waiting;		/* a thread sleeps on rblk_memcv */

	struct disklabel rblk_label;	/* fabricated disklabel */
} minors[RUMPBLK_SIZE];
130
/* statistics: i/o request counts */
static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

/* statistics: mmap window cache behavior */
static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

/* statistics: bytes transferred */
static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

/* normal block device switch */
static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

/* block device switch used when fault injection (RUMP_BLKFAIL) is active */
static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

/* raw character device switch */
static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;			/* failure rate; 0 = injection off */
static unsigned randstate;		/* private PRNG state for gimmerand */
static kmutex_t rumpblk_lock;		/* protects the minors[] table */
static int sectshift = DEV_BSHIFT;	/* log2 of emulated sector size */
175
/*
 * Fabricate a minimal disklabel for an emulated device: a fictitious
 * single-track, single-cylinder disk whose partition "part" covers the
 * whole device.  Partitions before "part" are marked FS_UNUSED.
 *
 * NOTE(review): d_secperunit is assigned "size" directly while
 * d_nsectors and p_size use size >> sectshift; the caller passes size
 * in bytes, so d_secperunit appears to be in bytes rather than
 * sectors -- confirm intended units.
 */
static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}
211
/*
 * Find and return a mapped window covering file offset "off", with its
 * reference count incremented (release with putwindow()).  On return
 * *wsize is clamped to the bytes available in the window past "off".
 * If no cached window matches, the least recently used idle window is
 * recycled and remapped; if every window is busy, sleep until one is
 * released.  Returns NULL (with *error set by rumpuser_filemmap) if a
 * new mapping could not be established.
 *
 * The LRU queue, refcounts and rblk_waiting are protected by
 * rblk_memmtx.
 */
static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found? return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window.  If the least recently used is not
	 * currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		/* drop the lock across the (possibly slow) host calls */
		mutex_exit(&rblk->rblk_memmtx);

		/* tear down any previous mapping before recycling */
		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		/* mmap failed: return the invalidated window and bail */
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		/* all windows in use: wait for putwindow() to wake us */
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	/* most recently used: move to the head of the LRU queue */
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	/* clamp i/o size to what remains of the window past "off" */
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}
280
281 static void
282 putwindow(struct rblkdev *rblk, struct blkwin *win)
283 {
284
285 mutex_enter(&rblk->rblk_memmtx);
286 if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
287 rblk->rblk_waiting = false;
288 cv_broadcast(&rblk->rblk_memcv);
289 }
290 KASSERT(win->win_refcnt >= 0);
291 mutex_exit(&rblk->rblk_memmtx);
292 }
293
294 static void
295 wincleanup(struct rblkdev *rblk)
296 {
297 struct blkwin *win;
298
299 while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
300 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
301 if (WINVALID(win)) {
302 DPRINTF(("cleanup win %p addr %p\n",
303 win, win->win_mem));
304 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
305 }
306 kmem_free(win, sizeof(*win));
307 }
308 rblk->rblk_mmflags = 0;
309 }
310
311 int
312 rumpblk_init(void)
313 {
314 char buf[64];
315 devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
316 unsigned tmp;
317 int error, i;
318
319 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
320
321 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
322 blkfail = strtoul(buf, NULL, 10);
323 /* fail everything */
324 if (blkfail > BLKFAIL_MAX)
325 blkfail = BLKFAIL_MAX;
326 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
327 &error) == 0) {
328 randstate = strtoul(buf, NULL, 10);
329 } else {
330 randstate = cprng_fast32();
331 }
332 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
333 "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
334 } else {
335 blkfail = 0;
336 }
337
338 if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
339 printf("rumpblk: ");
340 tmp = strtoul(buf, NULL, 10);
341 if (tmp && !(tmp & (tmp-1)))
342 memwinsize = tmp;
343 else
344 printf("invalid RUMP_BLKWINSIZE %d, ", tmp);
345 printf("using %d for memwinsize\n", memwinsize);
346 }
347 if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){
348 printf("rumpblk: ");
349 tmp = strtoul(buf, NULL, 10);
350 if (tmp)
351 memwincnt = tmp;
352 else
353 printf("invalid RUMP_BLKWINCOUNT %d, ", tmp);
354 printf("using %d for memwincount\n", memwincnt);
355 }
356 if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error)==0){
357 printf("rumpblk: ");
358 tmp = strtoul(buf, NULL, 10);
359 if (tmp >= DEV_BSHIFT)
360 sectshift = tmp;
361 else
362 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
363 DEV_BSHIFT, tmp);
364 printf("using %d for sector shift (size %d)\n",
365 sectshift, 1<<sectshift);
366 }
367
368 memset(minors, 0, sizeof(minors));
369 for (i = 0; i < RUMPBLK_SIZE; i++) {
370 mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
371 cv_init(&minors[i].rblk_memcv, "rblkmcv");
372 minors[i].rblk_fd = -1;
373 }
374
375 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
376 "rumpblk", "I/O reqs");
377 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
378 "rumpblk", "async I/O");
379
380 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
381 "rumpblk", "bytes read");
382 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
383 "rumpblk", "bytes written");
384 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
385 "rumpblk", "bytes written async");
386
387 evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
388 "rumpblk", "window hits");
389 evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
390 "rumpblk", "all windows busy");
391
392 if (blkfail) {
393 return devsw_attach("rumpblk",
394 &rumpblk_bdevsw_fail, &rumpblkmaj,
395 &rumpblk_cdevsw, &rumpblkmaj);
396 } else {
397 return devsw_attach("rumpblk",
398 &rumpblk_bdevsw, &rumpblkmaj,
399 &rumpblk_cdevsw, &rumpblkmaj);
400 }
401 }
402
/*
 * Register a host file or device as a rump block device.  "offset" and
 * "size" select a region of the host file; size == RUMPBLK_SIZENOTSET
 * means everything from offset to end of file.  If the path is already
 * registered, the existing minor is reused.  On success the assigned
 * minor number is stored in *dmin.  Returns 0 or an errno.
 */
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	/* already registered?  hand back the existing minor */
	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	/* find a free slot */
	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	/*
	 * Claim the slot with a placeholder path before dropping the
	 * lock so a concurrent registration cannot grab the same minor.
	 */
	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		/* requested region must fit inside the host file */
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	/* open the host backend; on failure release the slot */
	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}
468
469 /*
470 * Unregister rumpblk. It's the callers responsibility to make
471 * sure it's no longer in use.
472 */
473 int
474 rumpblk_deregister(const char *path)
475 {
476 struct rblkdev *rblk;
477 int i;
478
479 mutex_enter(&rumpblk_lock);
480 for (i = 0; i < RUMPBLK_SIZE; i++) {
481 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
482 break;
483 }
484 }
485 mutex_exit(&rumpblk_lock);
486
487 if (i == RUMPBLK_SIZE)
488 return ENOENT;
489
490 rblk = &minors[i];
491 backend_close(rblk);
492
493 wincleanup(rblk);
494 free(rblk->rblk_path, M_TEMP);
495 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
496 rblk->rblk_path = NULL;
497
498 return 0;
499 }
500
/*
 * Open the host backend for rblk.  Tries read/write access first and
 * falls back to read-only, recording the granted modes in rblk_mode.
 * For regular files, additionally prime the memory-mapped i/o window
 * cache; failure to map simply leaves rblk_mmflags at 0, and i/o then
 * falls back to read/write host calls.  Returns 0 or an errno.
 */
static int
backend_open(struct rblkdev *rblk, const char *path)
{
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	fd = rumpuser_open(path,
	    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &error);
	if (error) {
		/* r/w failed, retry read-only */
		fd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &error);
		if (error)
			return error;
		rblk->rblk_mode = FREAD;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_DIRECT, &error);
		if (error) {
			close(fd);
			return error;
		}
#endif
	} else {
		rblk->rblk_mode = FREAD|FWRITE;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_DIRECT, &error);
		if (error) {
			close(fd);
			return error;
		}
#endif
	}

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize= rblk->rblk_hostsize, off= rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		/* mmap protection mirrors the access the host granted */
		rblk->rblk_mmflags = 0;
		if (rblk->rblk_mode & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (rblk->rblk_mode & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		/* one window per memwinsize chunk, up to memwincnt/EOF */
		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				/* no mmap: tear down and disable the path */
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}
586
587 static int
588 backend_close(struct rblkdev *rblk)
589 {
590 int dummy;
591
592 if (rblk->rblk_mmflags)
593 wincleanup(rblk);
594 rumpuser_fsync(rblk->rblk_fd, &dummy);
595 rumpuser_close(rblk->rblk_fd, &dummy);
596 rblk->rblk_fd = -1;
597 #ifdef HAS_ODIRECT
598 if (rblk->rblk_dfd != -1) {
599 rumpuser_close(rblk->rblk_dfd, &dummy);
600 rblk->rblk_dfd = -1;
601 }
602 #endif
603
604 return 0;
605 }
606
607 int
608 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
609 {
610 struct rblkdev *rblk = &minors[minor(dev)];
611
612 if (rblk->rblk_fd == -1)
613 return ENXIO;
614
615 if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
616 return EACCES;
617 }
618
619 return 0;
620 }
621
622 int
623 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
624 {
625
626 return 0;
627 }
628
629 int
630 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
631 {
632 devminor_t dmin = minor(dev);
633 struct rblkdev *rblk = &minors[dmin];
634 struct partinfo *pi;
635 int error = 0;
636
637 /* well, me should support a few more, but we don't for now */
638 switch (xfer) {
639 case DIOCGDINFO:
640 *(struct disklabel *)addr = rblk->rblk_label;
641 break;
642
643 case DIOCGPART:
644 pi = addr;
645 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
646 pi->disklab = &rblk->rblk_label;
647 break;
648
649 /* it's synced enough along the write path */
650 case DIOCCACHESYNC:
651 break;
652
653 default:
654 error = ENOTTY;
655 break;
656 }
657
658 return error;
659 }
660
661 static int
662 do_physio(dev_t dev, struct uio *uio, int which)
663 {
664 void (*strat)(struct buf *);
665
666 if (blkfail)
667 strat = rumpblk_strategy_fail;
668 else
669 strat = rumpblk_strategy;
670
671 return physio(strat, NULL, dev, which, minphys, uio);
672 }
673
674 int
675 rumpblk_read(dev_t dev, struct uio *uio, int flags)
676 {
677
678 return do_physio(dev, uio, B_READ);
679 }
680
681 int
682 rumpblk_write(dev_t dev, struct uio *uio, int flags)
683 {
684
685 return do_physio(dev, uio, B_WRITE);
686 }
687
/*
 * Perform block i/o described by "bp" against the host backend and
 * complete it with rump_biodone().  Validates that the transfer is a
 * multiple of the sector size and bounds-checks against the device
 * size.  When the backing file is mapped (rblk_mmflags != 0) the
 * transfer is a memcpy to/from the mmap windows; otherwise it is
 * handed to rumpuser_bio(), which completes asynchronously.
 */
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error, op;

	/* transfers must be in whole emulated sectors */
	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	/* translate device-relative offset to host file offset */
	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap?  handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		/* copy one window's worth per iteration */
		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				/* partial completion with error */
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
	} else {
		/* no mmap: hand off to the async host i/o routine */
		op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
		if (BUF_ISWRITE(bp) && !async)
			op |= RUMPUSER_BIO_SYNC;

		rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
		    rump_biodone, bp);
	}
}
794
/* Standard strategy entry point: no fault injection. */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
801
802 /*
803 * Simple random number generator. This is private so that we can
804 * very repeatedly control which blocks will fail.
805 *
806 * <mlelstv> pooka, rand()
807 * <mlelstv> [paste]
808 */
809 static unsigned
810 gimmerand(void)
811 {
812
813 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
814 }
815
816 /*
817 * Block device with very simple fault injection. Fails every
818 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
819 * variable RUMP_BLKFAIL.
820 */
821 void
822 rumpblk_strategy_fail(struct buf *bp)
823 {
824
825 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
826 dostrategy(bp);
827 } else {
828 printf("block fault injection: failing I/O on block %lld\n",
829 (long long)bp->b_blkno);
830 bp->b_error = EIO;
831 biodone(bp);
832 }
833 }
834