/*	$NetBSD: rumpblk.c,v 1.44 2011/02/03 10:06:06 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation. Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection. The driver can be made to fail
 * I/O occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O. This avoids a host kernel call for
 * every I/O operation. It also gives finer-grained control of how
 * to flush data. Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 *
 * However, it is quite costly when writing large amounts of
 * file data, since old contents cannot merely be overwritten, but
 * must be paged in first before replacing (i.e. r/m/w). Ideally,
 * we should use directio. The problem is that directio can fail
 * silently, causing improper file system semantics (i.e. unflushed
 * data). Therefore, we default to mmap for now. Even so, directio
 * _should_ be safe and can be enabled by compiling this module
 * with -DHAS_DIRECTIO.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.44 2011/02/03 10:06:06 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				    memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
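
/*
 * One memory-mapped window into the backing host file. Windows are
 * kept on a per-device LRU queue and are reference counted; an
 * invalid window is marked with win_off == -1 (see WINVALID above).
 */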
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	uint64_t rblk_hostsize;
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

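/*
 * Fabricate a disklabel for the backing storage: one track, one
 * cylinder, with a single partition covering the whole unit.
 */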
static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

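/*
 * Find a window covering the given offset, mapping a new one in if
 * necessary, and take a reference to it. If all windows are busy,
 * sleep until one is released. On success *wsize is clipped to what
 * this window can serve starting at off; if mapping fails, NULL is
 * returned and *error is set by rumpuser_filemmap().
 */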
static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found? return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window. If the least recently used is not
	 * currently in use, reuse that. Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

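/*
 * Release a window reference and wake up anyone waiting for a free
 * window.
 */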
static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_broadcast(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

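/*
 * Unmap and free all windows of a device and clear its mmap flags,
 * which disables the mmap I/O path.
 */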
static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

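/*
 * Driver bootstrap: parse the RUMP_BLKFAIL, RUMP_BLKWINSIZE,
 * RUMP_BLKWINCOUNT and RUMP_BLKSECTSHIFT environment tunables,
 * attach event counters, and register the device switch entries
 * (the fault-injecting bdevsw if RUMP_BLKFAIL was given).
 */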
int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincount\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d "
			    "(now %u), ", DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

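/*
 * Register a host file or device as a rumpblk minor. Registering
 * the same path twice returns the existing minor. The caller may
 * expose only a slice of the host file by passing a nonzero offset
 * and a size other than RUMPBLK_SIZENOTSET.
 */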
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_fd = -1;
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
	mutex_exit(&rumpblk_lock);

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk. It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	KASSERT(rblk->rblk_fd == -1);

	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	rblk->rblk_path = NULL;
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));

	return 0;
}

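/*
 * Open the backing host file. For regular files, additionally try
 * to set up the mmap window cache; if even one window cannot be
 * mapped, fall back to plain read/write I/O.
 */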
int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int error, fd;

	if (rblk->rblk_path == NULL)
		return ENXIO;

	if (rblk->rblk_fd != -1)
		return 0; /* XXX: refcount, open mode */
	flag &= ~O_TRUNC;
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

#ifdef HAS_ODIRECT
	rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
	    OFLAGS(flag) | O_DIRECT, &error);
	if (error)
		return error;
#endif

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file. Allocate and
		 * cache initial windows here. Failure to allocate one
		 * means falling back to read/write I/O.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows. Here we just generally
			 * make sure a) we can mmap at all and b) we have
			 * the necessary VA available.
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

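/*
 * Close the device: tear down the window cache if it was in use,
 * then fsync and close the host file descriptor.
 */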
int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

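/*
 * Minimal disk ioctl support: serve the fabricated disklabel and
 * accept cache syncs as no-ops, since the write path already syncs.
 */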
int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

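/*
 * Raw read/write is routed through physio() to the same strategy
 * routine that block I/O uses, so fault injection applies here too.
 */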
static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

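/*
 * The strategy workhorse: validate and bounds-check the request,
 * then service it either by copying through an mmap window or by
 * handing it to the rumpuser I/O routines.
 */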
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count += bp->b_bcount;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file. Otherwise
	 * invalid file systems might attempt to read beyond EOF. This
	 * is bad(tm) especially on mmapped images. This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap? handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O. We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed. The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel. I/Os are
	 * executed in series. Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio. Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator. This is private so that we can
 * reproducibly control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection. Fails n out of
 * every BLKFAIL_MAX I/Os with EIO. n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}