/*	$NetBSD: rumpblk.c,v 1.30 2009/11/19 13:46:55 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation. Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection: the driver can be made to fail
 * I/O requests at a configurable rate.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O. This avoids a host system call for
 * every I/O operation. It also gives finer-grained control over
 * when to flush data. Additionally, in case the rump kernel dumps
 * core, we get way less carnage.
 *
 * However, mmap is quite costly when writing large amounts of
 * file data, since old contents cannot merely be overwritten, but
 * must first be paged in before being replaced (i.e. read/modify/write).
 * Ideally, we should use directio. The problem is that directio
 * can fail silently, causing improper file system semantics (i.e.
 * unflushed data). Therefore, we default to mmap for now. Even so,
 * directio _should_ be safe and can be enabled by compiling this
 * module with -DHAS_ODIRECT.
 */
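
/*
 * Example of enabling fault injection (hypothetical invocation; any
 * rump client is driven the same way):
 *
 *	RUMP_BLKFAIL=10 RUMP_BLKFAIL_SEED=1234 ./a.out
 *
 * makes roughly 10 out of every 10000 (BLKFAIL_MAX) I/O requests fail
 * with EIO, and the fixed seed makes the failing blocks repeatable
 * across runs. See rumpblk_init() below for the exact parsing.
 */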

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.30 2009/11/19 13:46:55 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~(memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_size-win->win_off),memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
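
/*
 * A worked example of the window arithmetic, with the default 1MB
 * windows: for off = 0x123456, STARTWIN(off) masks the low bits and
 * yields 0x100000, so INWIN() matches any window whose win_off is
 * 0x100000, and the request's offset within that window is
 * off - STARTWIN(off) = 0x23456. WINSIZE() clamps the final window
 * so that it does not extend past the end of the backing file.
 */
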
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_opencnt;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct partition *rblk_curpi;
	struct partition rblk_pi;
	struct disklabel rblk_dl;
} minors[RUMPBLK_SIZE];
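
/*
 * Note: the device minor number indexes minors[] directly, so at most
 * RUMPBLK_SIZE (16) backing files can be registered at a time; see
 * rumpblk_register() below, which returns EBUSY once the array fills.
 */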

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;

static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found? return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window. If the least recently used is not
	 * currently in use, reuse that. Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_signal(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}
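
/*
 * Sketch of the intended get/put protocol (names as above; the copy
 * direction depends on read vs. write):
 *
 *	winsize = len;
 *	win = getwindow(rblk, off, &winsize, &error);
 *	if (win == NULL)
 *		return error;
 *	memcpy(data, (uint8_t *)win->win_mem + (off - STARTWIN(off)),
 *	    winsize);
 *	putwindow(rblk, win);
 *
 * getwindow() may shrink *wsize to the end of the window, so callers
 * loop until the full request is satisfied (see dostrategy() below).
 * win_mem must not be touched after putwindow(), since the window may
 * then be recycled for another offset.
 */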

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

int
rumpblk_init(void)
{
	char buf[64];
	int rumpblk = RUMPBLK;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincount\n", memwincnt);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "rumpblk bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "memblk window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "memblk all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
		    &rumpblk_cdevsw, &rumpblk);
	} else {
		return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
		    &rumpblk_cdevsw, &rumpblk);
	}
}

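/*
 * Minimal usage sketch for registering a backing file (hypothetical
 * path, error handling omitted; RUMPBLK_SIZENOTSET asks the driver to
 * use the size reported by the host minus the offset):
 *
 *	devminor_t dmin;
 *	error = rumpblk_register("/tmp/disk.img", &dmin, 0,
 *	    RUMPBLK_SIZENOTSET);
 *
 * Registering the same path twice returns the already-assigned minor.
 */
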
/* XXX: no deregister */
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_fd = -1;
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_ftype = ftype;
	mutex_exit(&rumpblk_lock);

	*dmin = i;
	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;
	int error, fd;

	if (rblk->rblk_path == NULL)
		return ENXIO;

	if (rblk->rblk_fd != -1)
		return 0; /* XXX: refcount, open mode */
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

#ifdef HAS_ODIRECT
	rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
	    OFLAGS(flag) | O_DIRECT, &error);
	if (error)
		return error;
#endif

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file. Allocate and
		 * cache initial windows here. Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows. Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}

		memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
		rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
		rblk->rblk_dl.d_secsize = DEV_BSIZE;
		rblk->rblk_curpi = &rblk->rblk_pi;
	} else {
		rblk->rblk_fd = fd;

		if ((error = rumpblk_ioctl(dev, DIOCGDINFO, &rblk->rblk_dl,
		    0, curlwp)) != 0) {
			rumpuser_close(fd, &dummy);
			return error;
		}

		rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int rv, error;

	if (xfer == DIOCGPART) {
		struct partinfo *pi = (struct partinfo *)addr;

		pi->part = rblk->rblk_curpi;
		pi->disklab = &rblk->rblk_dl;

		return 0;
	}

	rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
	if (rv == -1)
		return error;

	return 0;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count += bp->b_bcount;
	}

	off = bp->b_blkno << DEV_BSHIFT;
	off += rblk->rblk_hostoffset;
	/*
	 * Do bounds checking if we're working on a file. Otherwise
	 * invalid file systems might attempt to read beyond EOF. This
	 * is bad(tm) especially on mmapped images. This is essentially
	 * the kernel bounds_check() routines.
	 */
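	/*
	 * E.g. (hypothetical numbers): with rblk_size = 1MB, an
	 * 8192-byte request starting 4096 bytes before the end is
	 * truncated to 4096 bytes, one starting exactly at the end
	 * completes immediately with zero bytes, and one starting
	 * past the end fails with EINVAL.
	 */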
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap? handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O. We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed. The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel. I/Os are
	 * executed serially. Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio. Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & (DEV_BSIZE-1)) == 0
				    && ((intptr_t)bp->b_data&(DEV_BSIZE-1)) == 0
				    && (bp->b_bcount & (DEV_BSIZE-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		rumpuser_mutex_enter(&rumpuser_aio_mtx);
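		/*
		 * The request ring is full when advancing the head
		 * would make it catch up with the tail, so wait for
		 * the consumer in rumpuser to free a slot.
		 */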
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator. This is private so that we can
 * repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}
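
/*
 * The multiplier/increment pair 1103515245/12345 above is the one from
 * the C standard's example rand() implementation, and the reduction
 * mod 2^31 keeps the result in rand()'s traditional range. The point
 * here is determinism rather than statistical quality: the same
 * RUMP_BLKFAIL_SEED always fails the same sequence of requests.
 */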

/*
 * Block device with very simple fault injection. Fails every
 * n out of BLKFAIL_MAX I/Os with EIO. n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}