/*	$NetBSD: rumpblk.c,v 1.26 2009/10/06 13:05:44 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection.  The driver can be made to fail
 * I/O occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids a host system call for every
 * I/O operation.  It also gives finer-grained control of how to
 * flush data.  Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.26 2009/10/06 13:05:44 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

/*
 * Cast memwinsize to off_t before complementing so that the mask
 * does not truncate offsets beyond 4GB.
 */
#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_size-win->win_off),memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
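/*
 * Worked example, assuming the default 1MB memwinsize: for
 * off = 0x123456, STARTWIN(off) == 0x100000, so INWIN() matches a
 * window whose win_off is 0x100000.  WINSIZE() clamps the final
 * window to the size of the backing file.
 */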
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_opencnt;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct partition *rblk_curpi;
	struct partition rblk_pi;
	struct disklabel rblk_dl;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;

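/*
 * Get a memory window covering "off".  If no cached window matches,
 * recycle the least recently used unreferenced window, or sleep until
 * one becomes available.  On success the window is returned with a
 * reference held and *wsize is clamped to the number of bytes
 * available in the window starting at "off".
 */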
static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found?  return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create a new window.  If the least recently used window
	 * is not currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

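/*
 * Drop a reference to a window and wake up a waiter, if there is one.
 */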
static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_signal(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

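/*
 * Unmap and free all windows.  Called when the device is closed and
 * when window setup fails during open.
 */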
static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

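/*
 * Initialize the driver: parse tunables from the host environment
 * and attach the device to devsw.  RUMP_BLKFAIL and RUMP_BLKFAIL_SEED
 * control fault injection, RUMP_BLKWINSIZE and RUMP_BLKWINCOUNT
 * control the mmap window geometry.  For example, RUMP_BLKFAIL=100
 * makes roughly 1% (100/10000) of I/O requests fail with EIO.
 */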
int
rumpblk_init(void)
{
	char buf[64];
	int rumpblk = RUMPBLK;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* values above the maximum mean "fail everything" */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE!  fail %d/%d.  "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		/* window size must be a nonzero power of two */
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincnt\n", memwincnt);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "memblk window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "memblk all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
		    &rumpblk_cdevsw, &rumpblk);
	} else {
		return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
		    &rumpblk_cdevsw, &rumpblk);
	}
}

/*
 * Register a host file as a block device backing store.  If the path
 * is already registered, return the existing minor number.
 * XXX: no way to deregister.
 */
int
rumpblk_register(const char *path, devminor_t *dmin)
{
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	if (rumpuser_getfileinfo(path, &flen, &ftype, &error))
		return error;
	/* verify host file is of a supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	len = strlen(path);
	minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(minors[i].rblk_path, path);
	minors[i].rblk_fd = -1;
	mutex_exit(&rumpblk_lock);

	*dmin = i;
	return 0;
}

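/*
 * Open the backing host file.  Regular files are accessed through
 * mmapped memory windows; block and character devices are used
 * directly and their disklabel is fetched with DIOCGDINFO.
 */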
int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	uint64_t fsize;
	int ft, dummy;
	int error, fd;

	if (rblk->rblk_fd != -1)
		return 0; /* XXX: refcount, open mode */
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

	if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
		rumpuser_close(fd, &dummy);
		return error;
	}

#ifdef HAS_ODIRECT
	rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
	    OFLAGS(flag) | O_DIRECT, &error);
	if (error) {
		/* don't leak the regular descriptor */
		rumpuser_close(fd, &dummy);
		return error;
	}
#endif

	if (ft == RUMPUSER_FT_REG) {
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_size = fsize;
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && i * memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = 1;
			win = getwindow(rblk, i*memwinsize, &winsize, &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}

		memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
		rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
		rblk->rblk_dl.d_secsize = DEV_BSIZE;
		rblk->rblk_curpi = &rblk->rblk_pi;
	} else {
		if (rumpuser_ioctl(fd, DIOCGDINFO, &rblk->rblk_dl,
		    &error) == -1) {
			KASSERT(error);
			rumpuser_close(fd, &dummy);
			return error;
		}

		rblk->rblk_fd = fd;
		rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int rv, error;

	/* handle DIOCGPART locally, pass everything else to the host */
	if (xfer == DIOCGPART) {
		struct partinfo *pi = (struct partinfo *)addr;

		pi->part = rblk->rblk_curpi;
		pi->disklab = &rblk->rblk_dl;

		return 0;
	}

	rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
	if (rv == -1)
		return error;

	return 0;
}

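/*
 * Route character device read/write through physio(), picking the
 * fault-injecting strategy routine when fault injection is active.
 */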
static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

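/*
 * Common strategy routine: gather statistics and bounds check the
 * request, then perform the I/O either by copying through a memory
 * window (mmapped regular files) or by handing the request to
 * rumpuser, asynchronously if the rump kernel runs with threads.
 */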
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		/* count bytes, like the write counters above */
		ev_bread_total.ev_count += bp->b_bcount;
	}

	off = bp->b_blkno << DEV_BSHIFT;
	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (rblk->rblk_size && off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* request starts exactly at EOF: done, 0 bytes */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* request starts beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate request to device size */
		bp->b_bcount = sz;
	}

	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/*
	 * For mmapped files, handle the I/O here by copying through
	 * the memory windows, then return.
	 */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write: sync the bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do the I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser, where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/Os are
	 * executed serially.  Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & (DEV_BSIZE-1)) == 0
				    && ((intptr_t)bp->b_data&(DEV_BSIZE-1)) == 0
				    && (bp->b_bcount & (DEV_BSIZE-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

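		/*
		 * Enqueue the request into the shared request ring.
		 * The ring is full when advancing the head would make
		 * it meet the tail; in that case, wait for the rumpuser
		 * I/O thread to consume an entry.
		 */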
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection.  Fails n out of
 * every BLKFAIL_MAX I/Os with EIO.  n is determined by the env
 * variable RUMP_BLKFAIL; e.g. RUMP_BLKFAIL=100 fails roughly 1%
 * of all requests.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}