/*	$NetBSD: rumpblk.c,v 1.46 2011/02/03 22:16:11 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection.  The driver can be made to fail
 * I/O occasionally.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids kernel access for every
 * I/O operation.  It also gives finer-grained control of how to
 * flush data.  Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 *
 * However, it is quite costly when writing large amounts of
 * file data, since old contents cannot merely be overwritten, but
 * must be paged in first before replacing (i.e. r/m/w).  Ideally,
 * we should use directio.  The problem is that directio can fail
 * silently, causing improper file system semantics (i.e. unflushed
 * data).  Therefore, default to mmap for now.  Even so, directio
 * _should_ be safe and can be enabled by compiling this module
 * with -DHAS_DIRECTIO.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.46 2011/02/03 22:16:11 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

/*
 * O_DIRECT is the fastest alternative, but since it falls back to
 * non-direct writes silently, I am not sure it will always be 100% safe.
 * Use it and play with it, but do that with caution.
 */
#ifdef HAS_DIRECTIO
#define HAS_ODIRECT
#endif
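
/*
 * A hedged illustration (exact build plumbing varies per build system):
 * the O_DIRECT paths guarded by HAS_ODIRECT are compiled in by defining
 * the flag named in the module comment on the compiler command line, e.g.
 *
 *	cc -DHAS_DIRECTIO ... rumpblk.c
 */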

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				      memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
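
/*
 * A worked example of the window arithmetic, assuming the default
 * memwinsize of 1MB (0x100000): for off = 0x123456, STARTWIN(off)
 * clears the low 20 bits, giving 0x100000, so a window with
 * win_off == 0x100000 satisfies INWIN(win, off) and the byte lives
 * at offset 0x23456 into win_mem.  WINSIZE() clips the last window
 * of a file to whatever is left of the host file.
 */
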
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_mode;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	uint64_t rblk_hostsize;
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
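/*
 * For example, RUMP_BLKFAIL=100 fails roughly 100 out of every
 * 10000 I/O requests, i.e. about 1%.
 */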
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found?  return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create a new window.  If the least recently used window
	 * is not currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_broadcast(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}
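
/*
 * A minimal usage sketch of the window pair above, mirroring the mmap
 * path in dostrategy(): winsize is value-result, i.e. the caller asks
 * for a length and getwindow() clips it to what one window can serve
 * contiguously.
 *
 *	winsize = len;
 *	win = getwindow(rblk, off, &winsize, &error);
 *	if (win == NULL)
 *		... report error ...
 *	memcpy(buf, (uint8_t *)win->win_mem + (off - STARTWIN(off)), winsize);
 *	putwindow(rblk, win);
 */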

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE!  fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincount\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d "
			    "(got %u), ", DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}
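
	/*
	 * For example (hypothetical values), starting a rump kernel with
	 * RUMP_BLKWINSIZE=2097152 RUMP_BLKWINCOUNT=32 in the environment
	 * selects 32 x 2MB windows.  Note that the window size must be a
	 * power of two and the sector shift can only be raised from
	 * DEV_BSHIFT.
	 */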

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
		minors[i].rblk_fd = -1;
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

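/*
 * Register a host file or device as a rumpblk minor.  If the path is
 * already registered, return its existing minor.  A size of
 * RUMPBLK_SIZENOTSET means "use everything from offset to the end of
 * the host file".
 */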
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk.  It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	backend_close(rblk);

	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
	rblk->rblk_path = NULL;

	return 0;
}

static int
backend_open(struct rblkdev *rblk, const char *path)
{
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	fd = rumpuser_open(path, O_RDWR, &error);
	if (error) {
		fd = rumpuser_open(path, O_RDONLY, &error);
		if (error)
			return error;
		rblk->rblk_mode = FREAD;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    O_RDONLY | O_DIRECT, &error);
		if (error) {
			int dummy;

			/* host fd, so close it through the rumpuser layer */
			rumpuser_close(fd, &dummy);
			return error;
		}
#endif
	} else {
		rblk->rblk_mode = FREAD|FWRITE;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    O_RDWR | O_DIRECT, &error);
		if (error) {
			int dummy;

			/* host fd, so close it through the rumpuser layer */
			rumpuser_close(fd, &dummy);
			return error;
		}
#endif
	}

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_hostsize;
		uint64_t off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (rblk->rblk_mode & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (rblk->rblk_mode & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

static int
backend_close(struct rblkdev *rblk)
{
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;
#ifdef HAS_ODIRECT
	if (rblk->rblk_dfd != -1) {
		rumpuser_close(rblk->rblk_dfd, &dummy);
		rblk->rblk_dfd = -1;
	}
#endif

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];

	if (rblk->rblk_fd == -1)
		return ENXIO;

	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
		return EACCES;
	}

	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		/* the evcnt counts bytes read, so add the full request */
		ev_bread_total.ev_count += bp->b_bcount;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
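	/*
	 * For example, with the standard DEV_BSHIFT of 9 (DEV_BSIZE 512),
	 * b_blkno 1024 becomes byte offset 1024 << 9 = 0x80000, i.e. 512kB.
	 */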
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap?  handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/Os are
	 * executed serially.  Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		rumpuser_mutex_enter(&rumpuser_aio_mtx);
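		/*
		 * Wait for a free slot.  The ring is full when advancing
		 * head would run into tail; one slot is always left unused
		 * so that head == tail unambiguously means "empty".
		 */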
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * very repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection.  Fails every
 * n out of BLKFAIL_MAX I/Os with EIO.  n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}