/*	$NetBSD: rumpblk.c,v 1.40 2010/06/15 18:53:48 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation. Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection. The driver can be made to fail
 * I/O occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O. This avoids kernel access for every
 * I/O operation. It also gives finer-grained control of how to
 * flush data. Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 *
 * However, it is quite costly when writing large amounts of
 * file data, since old contents cannot merely be overwritten, but
 * must be paged in first before replacing (i.e. r/m/w). Ideally,
 * we should use directio. The problem is that directio can fail
 * silently, causing improper file system semantics (i.e. unflushed
 * data). Therefore, we default to mmap for now. Even so, directio
 * _should_ be safe and can be enabled by compiling this module
 * with -DHAS_ODIRECT (which is the macro the code below tests).
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.40 2010/06/15 18:53:48 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN(((rblk)->rblk_size-(win)->win_off),memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
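/*
 * Window arithmetic example (with the default 1MB memwinsize):
 * for off = 0x123456, STARTWIN(off) = 0x100000, i.e. the window
 * covers the file range [0x100000, 0x200000). WINSIZE() clamps
 * the final window so that it does not run past the end of the
 * backing file.
 */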
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
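/* one device slot per minor number */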
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_opencnt;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

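/*
 * Find or create a memory window covering "off". The returned
 * window has a reference held and must be released with putwindow().
 * *wsize is clamped so that the transfer does not extend past the
 * end of the window. Returns NULL if mmapping the window failed.
 */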
static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found? return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window. If the least recently used is not
	 * currently in use, reuse that. Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_signal(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

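/*
 * Environment tunables consulted at init time (the values below are
 * illustrative examples, not defaults):
 *
 *	RUMP_BLKFAIL=100	fail ~100 out of every 10000 I/Os
 *	RUMP_BLKFAIL_SEED=42	seed for the fault injection PRNG
 *	RUMP_BLKWINSIZE=65536	mmap window size, must be a power of two
 *	RUMP_BLKWINCOUNT=32	number of mmap windows per device
 *	RUMP_BLKSECTSHIFT=12	sector shift, must be >= DEV_BSHIFT
 */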
int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincount\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d "
			    "(now %u), ", DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

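/*
 * Register a host file or device as a rump block device. An
 * illustrative call (hypothetical path, error handling omitted):
 *
 *	devminor_t dmin;
 *	rumpblk_register("/tmp/disk.img", &dmin, 0, RUMPBLK_SIZENOTSET);
 *
 * where RUMPBLK_SIZENOTSET means "use the entire backing file past
 * the given offset". Registering an already-registered path simply
 * returns the existing minor.
 */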
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path
		    && strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_fd = -1;
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
	mutex_exit(&rumpblk_lock);

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk. It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path
		    && strcmp(minors[i].rblk_path, path) == 0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	KASSERT(rblk->rblk_fd == -1);
	KASSERT(rblk->rblk_opencnt == 0);

	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	rblk->rblk_path = NULL;
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int error, fd;

	if (rblk->rblk_path == NULL)
		return ENXIO;

	if (rblk->rblk_fd != -1)
		return 0; /* XXX: refcount, open mode */
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

#ifdef HAS_ODIRECT
	rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
	    OFLAGS(flag) | O_DIRECT, &error);
	if (error) {
		int dummy;

		/* don't leak the already-open plain descriptor */
		rumpuser_close(fd, &dummy);
		return error;
	}
#endif

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file. Allocate and
		 * cache initial windows here. Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows. Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

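/*
 * Common strategy routine: collect statistics, bounds-check the
 * request against the device size, and then either copy the data
 * through a memory window (mmap case) or hand the request to
 * rumpuser for read/write, asynchronously when possible.
 */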
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		/* evcnt counts bytes read, so add the transfer size */
		ev_bread_total.ev_count += bp->b_bcount;
	}

	off = bp->b_blkno << sectshift;
	/*
	 * Do bounds checking if we're working on a file. Otherwise
	 * invalid file systems might attempt to read beyond EOF. This
	 * is bad(tm) especially on mmapped images. This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap? handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O. We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed. The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel. I/Os are
	 * executed in series. Technically executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio. Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

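		/*
		 * The rumpuser aio queue is a fixed-size ring:
		 * rumpuser_aio_head is the producer index and
		 * rumpuser_aio_tail the consumer index, so the ring
		 * is full when head+1 equals tail (mod N_AIOS).
		 * Wait for a free slot before queueing the request.
		 */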
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator. This is private so that we can
 * very repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
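/*
 * (The multiplier/increment pair 1103515245/12345 below is the
 * classic example rand() LCG from the C standard.)
 */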
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection. Fails every
 * n out of BLKFAIL_MAX I/Os with EIO. n is determined by the env
 * variable RUMP_BLKFAIL.
 */
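/*
 * For example, running with RUMP_BLKFAIL=100 in the environment
 * makes roughly 1% of the I/O requests fail.
 */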
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}