/*	$NetBSD: rumpblk.c,v 1.43 2011/02/02 15:55:22 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection: the driver can be made to fail
 * I/O requests occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids a host kernel access for
 * every I/O operation.  It also gives finer-grained control over
 * how to flush data.  Additionally, in case the rump kernel dumps
 * core, we get way less carnage.
 *
 * However, it is quite costly when writing large amounts of file
 * data, since old contents cannot merely be overwritten, but must
 * first be paged in before being replaced (i.e. read/modify/write).
 * Ideally, we should use directio.  The problem is that directio
 * can fail silently, causing improper file system semantics (i.e.
 * unflushed data).  Therefore, we default to mmap for now.  Even
 * so, directio _should_ be safe and can be enabled by compiling
 * this module with -DHAS_ODIRECT.
 */
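
/*
 * Typical in-kernel usage, as a rough sketch (error handling
 * omitted; the image path is illustrative):
 *
 *	devminor_t dmin;
 *
 *	rumpblk_init();
 *	rumpblk_register("/img/ffs.img", &dmin, 0, RUMPBLK_SIZENOTSET);
 *
 * after which the device is accessible via the block device number
 * makedev(RUMPBLK_DEVMAJOR, dmin).
 */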

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.43 2011/02/02 15:55:22 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				    memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
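
/*
 * Worked example of the window arithmetic above, with the default
 * 1MB memwinsize: an access at offset 0x123456 belongs to the
 * window starting at STARTWIN(0x123456) == 0x100000, and the
 * in-window offset is 0x123456 - 0x100000 == 0x23456.  A window
 * is marked invalid by setting win_off to -1, which can never
 * match a real window start.
 */
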
struct blkwin {
	off_t win_off;		/* file offset of window start */
	void *win_mem;		/* mmapped window memory */
	int win_refcnt;		/* number of current users */

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16		/* number of emulated devices (minors) */
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_opencnt;
#ifdef HAS_ODIRECT
	int rblk_dfd;		/* O_DIRECT file descriptor */
#endif
	uint64_t rblk_size;	/* size visible to the rump kernel */
	uint64_t rblk_hostoffset; /* offset of device area in host file */
	uint64_t rblk_hostsize;	/* size of the backing host file */
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found?  return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window.  If the least recently used is not
	 * currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_broadcast(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}
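
/*
 * Windows follow a get/put protocol.  A minimal sketch of a read
 * through a window (this is what dostrategy() below does, window
 * by window):
 *
 *	winsize = nbytes;
 *	win = getwindow(rblk, off, &winsize, &error);
 *	if (win == NULL)
 *		return;
 *	memcpy(buf, (uint8_t *)win->win_mem + (off - win->win_off),
 *	    winsize);
 *	putwindow(rblk, win);
 *
 * getwindow() clamps winsize to what the window can serve starting
 * at off, so callers loop until the whole request is done.
 */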

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}
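
/*
 * The driver is tuned from the host environment (parsed in
 * rumpblk_init() below).  For example, to fail roughly 5 out of
 * every 10000 I/O requests with a reproducible failure pattern,
 * run the host process with:
 *
 *	RUMP_BLKFAIL=5 RUMP_BLKFAIL_SEED=1234
 *
 * RUMP_BLKWINSIZE (must be a power of two), RUMP_BLKWINCOUNT and
 * RUMP_BLKSECTSHIFT tune the mmap windows and the sector size.
 */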

int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* values above the max mean: fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE!  fail %d/%d.  "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincnt\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d (now %u), ",
			    DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path
		    && strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_fd = -1;
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
	mutex_exit(&rumpblk_lock);

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk.  It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path
		    && strcmp(minors[i].rblk_path, path) == 0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	KASSERT(rblk->rblk_fd == -1);
	KASSERT(rblk->rblk_opencnt == 0);

	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	rblk->rblk_path = NULL;
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int error, fd;

	if (rblk->rblk_path == NULL)
		return ENXIO;

	if (rblk->rblk_fd != -1)
		return 0; /* XXX: refcount, open mode */
	flag &= ~O_TRUNC;
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

#ifdef HAS_ODIRECT
	rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
	    OFLAGS(flag) | O_DIRECT, &error);
	if (error)
		return error;
#endif

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate
		 * one means falling back to read/write I/O.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure that a) we can mmap at all and b) we
			 * have the necessary VA available.
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		/* the counter tracks bytes, so add the transfer size */
		ev_bread_total.ev_count += bp->b_bcount;
	}

	off = bp->b_blkno << sectshift;
	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * what the kernel bounds_check() routines do.
	 */
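	/*
	 * For example, with rblk_size == 1MB: a 64kB request starting
	 * at offset 1MB-32kB is truncated to 32kB, a request starting
	 * at exactly 1MB completes immediately with 0 bytes (EOF), and
	 * one starting past 1MB fails with EINVAL.
	 */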
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap?  handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser, where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/Os are
	 * executed in series.  Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using a bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		/* wait for a free slot in the aio ring buffer */
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple pseudo-random number generator.  This is private so that
 * we can repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}
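
/*
 * Note: the multiplier and increment above are those of the classic
 * ANSI C example rand().  The state advances as x' = 1103515245*x +
 * 12345 (mod 2^32, via unsigned overflow) and the return value is
 * reduced mod 2^31.  Hence a fixed RUMP_BLKFAIL_SEED makes the set
 * of failing blocks identical from run to run.
 */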

/*
 * Block device with very simple fault injection.  Fails every
 * n out of BLKFAIL_MAX I/Os with EIO.  n is determined by the
 * environment variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}