/*	$NetBSD: rumpblk.c,v 1.41 2010/06/21 14:25:35 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection.  The driver can be made to fail
 * I/O occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids a host system call for every
 * I/O operation.  It also gives finer-grained control of how to
 * flush data.  Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 *
 * However, it is quite costly when writing large amounts of
 * file data, since old contents cannot merely be overwritten, but
 * must be paged in first before replacing (i.e. r/m/w).  Ideally,
 * we should use directio.  The problem is that directio can fail
 * silently, causing improper file system semantics (i.e. unflushed
 * data).  Therefore, default to mmap for now.  Even so, directio
 * _should_ be safe and can be enabled by compiling this module
 * with -DHAS_ODIRECT.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.41 2010/06/21 14:25:35 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				    memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
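/*
 * Worked example with the 1MB default window size: an I/O at offset
 * 0x12345 falls in the window starting at STARTWIN(0x12345) == 0x0,
 * while offset 0x123456 falls in the window at 0x100000.  WINSIZE()
 * clamps the final window so the mapping never extends past the end
 * of the host file.
 */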
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_opencnt;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	uint64_t rblk_hostsize;
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found?  return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window.  If the least recently used is not
	 * currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_signal(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %d, ", tmp);
		printf("using %d for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %d, ", tmp);
		printf("using %d for memwincount\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error)==0){
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d (now %d), ",
			    DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}
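
/*
 * Tuning sketch (hypothetical values): setting, e.g.,
 *
 *	RUMP_BLKWINSIZE=2097152 RUMP_BLKWINCOUNT=32 RUMP_BLKSECTSHIFT=12
 *
 * in the host environment before the rump kernel initializes would
 * give 32 x 2MB memory windows and a 4096-byte sector size.
 */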

int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_fd = -1;
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
	mutex_exit(&rumpblk_lock);

	*dmin = i;
	return 0;
}

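/*
 * Usage sketch (hypothetical caller): a rump component registers a
 * host file and then accesses the result as a block device, e.g.
 *
 *	devminor_t dmin;
 *	rumpblk_register("/tmp/disk.img", &dmin, 0, RUMPBLK_SIZENOTSET);
 *
 * after which the device is addressable via major RUMPBLK_DEVMAJOR
 * and minor dmin.  Registering the same path twice returns the
 * existing minor.
 */
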
/*
 * Unregister rumpblk.  It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	KASSERT(rblk->rblk_fd == -1);
	KASSERT(rblk->rblk_opencnt == 0);

	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	rblk->rblk_path = NULL;
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int error, fd;

	if (rblk->rblk_path == NULL)
		return ENXIO;

	if (rblk->rblk_fd != -1)
		return 0; /* XXX: refcount, open mode */
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

#ifdef HAS_ODIRECT
	rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
	    OFLAGS(flag) | O_DIRECT, &error);
	if (error)
		return error;
#endif

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count += bp->b_bcount;
	}

	off = bp->b_blkno << sectshift;
	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap?  handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

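		/*
		 * Copy the request through the memory windows: each
		 * iteration grabs the window covering the current
		 * offset and copies up to one window's worth of data,
		 * so a single request may span several windows.
		 */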
		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/Os are
	 * executed serially.  Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

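		/*
		 * Enqueue the request on the shared aio ring.  One
		 * slot is kept empty to tell a full ring from an empty
		 * one: head+1 == tail (mod N_AIOS) means full, so wait
		 * for the I/O thread to drain a slot in that case.
		 */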
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * repeatably control which blocks will fail.  The constants are those
 * of the classic example rand() from the C standard.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection.  Fails every
 * n out of BLKFAIL_MAX I/Os with EIO.  n is determined by the env
 * variable RUMP_BLKFAIL, so e.g. RUMP_BLKFAIL=100 fails roughly
 * 1% (100/10000) of requests.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}
