/*	$NetBSD: rumpblk.c,v 1.48 2012/09/14 16:29:21 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection: the driver can be made to fail a
 * configurable fraction of I/O requests.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids a host kernel access for
 * every I/O operation.  It also gives finer-grained control of how
 * to flush data.  Additionally, in case the rump kernel dumps core,
 * we get way less carnage.
 *
 * However, mmap is quite costly when writing large amounts of file
 * data, since the old contents cannot simply be overwritten: they
 * must first be paged in before being replaced (i.e. read/modify/write).
 * Ideally, we should use directio.  The problem is that directio can
 * silently fall back to buffered writes, causing improper file system
 * semantics (i.e. data believed to be flushed is not).  Therefore,
 * default to mmap for now.  Even so, directio _should_ be safe and
 * can be enabled by compiling this module with -DHAS_ODIRECT.
 */
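
/*
 * Illustrative use (a sketch only; the path and values are hypothetical):
 *
 *	devminor_t dmin;
 *	int error;
 *
 *	error = rumpblk_register("/tmp/disk.img", &dmin, 0,
 *	    RUMPBLK_SIZENOTSET);
 *	if (error == 0)
 *		... access device (RUMPBLK_DEVMAJOR, dmin) through the
 *		    usual bdev/cdev interfaces ...
 */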

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.48 2012/09/14 16:29:21 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/cprng.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

/*
 * O_DIRECT is the fastest alternative, but since it falls back to
 * non-direct writes silently, I am not sure it will always be 100% safe.
 * Use it and play with it, but do so with caution.
 */
#if 0
#define HAS_ODIRECT
#endif

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Default: 16 x 1MB windows */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				      memwinsize))
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)
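
/*
 * Example of the window arithmetic with the default 1MB memwinsize:
 * STARTWIN(0x123456) masks the offset down to 0x100000, so a window
 * covers an I/O offset iff its win_off equals that rounded-down value
 * (INWIN), and it is valid unless win_off is (off_t)-1.
 */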
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_mode;
#ifdef HAS_ODIRECT
	int rblk_dfd;
#endif
	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	uint64_t rblk_hostsize;
	int rblk_ftype;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;
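
/*
 * Example (values are illustrative): running with RUMP_BLKFAIL=100
 * makes roughly 1% (100/10000) of I/O requests fail with EIO, and
 * additionally setting RUMP_BLKFAIL_SEED=42 makes the failure pattern
 * repeatable from run to run.
 */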

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found?  return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window.  If the least recently used is not
	 * currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_broadcast(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = cprng_fast32();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %u, ", tmp);
		printf("using %u for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %u, ", tmp);
		printf("using %u for memwincount\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf),
	    &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d "
			    "(now %u), ", DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
		minors[i].rblk_fd = -1;
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}
	rblk = &minors[i];
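	/*
	 * Reserve the slot with a placeholder before dropping the lock;
	 * the real path is filled in right below.
	 */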
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk.  It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path &&
		    strcmp(minors[i].rblk_path, path) == 0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	backend_close(rblk);

	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
	rblk->rblk_path = NULL;

	return 0;
}

static int
backend_open(struct rblkdev *rblk, const char *path)
{
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	fd = rumpuser_open(path, RUMPUSER_OPEN_RDWR, &error);
	if (error) {
		fd = rumpuser_open(path, RUMPUSER_OPEN_RDONLY, &error);
		if (error)
			return error;
		rblk->rblk_mode = FREAD;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_DIRECT, &error);
		if (error) {
			int dummy;

			rumpuser_close(fd, &dummy);
			return error;
		}
#endif
	} else {
		rblk->rblk_mode = FREAD|FWRITE;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_DIRECT, &error);
		if (error) {
			int dummy;

			rumpuser_close(fd, &dummy);
			return error;
		}
#endif
	}

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize = rblk->rblk_hostsize;
		uint64_t off = rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache the initial windows here.  Failure to allocate
		 * one means we fall back to read/write I/O.
		 */

		rblk->rblk_mmflags = 0;
		if (rblk->rblk_mode & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (rblk->rblk_mode & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate the first windows.  Here we just
			 * generally make sure that a) we can mmap at
			 * all and b) we have the necessary VA available.
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

static int
backend_close(struct rblkdev *rblk)
{
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;
#ifdef HAS_ODIRECT
	if (rblk->rblk_dfd != -1) {
		rumpuser_close(rblk->rblk_dfd, &dummy);
		rblk->rblk_dfd = -1;
	}
#endif

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];

	if (rblk->rblk_fd == -1)
		return ENXIO;

	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
		return EACCES;
	}

	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count += bp->b_bcount;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
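	/* e.g. with DEV_BSHIFT == 9, b_blkno 16 maps to byte offset 0x2000 */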
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap?  handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

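		/*
		 * Copy in window-sized chunks: getwindow() caps winsize
		 * to the bytes left in the window, so e.g. a request
		 * crossing a 1MB window boundary is served as two (or
		 * more) consecutive copies.
		 */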
		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser, where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/Os are
	 * executed serially.  Technically, executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

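		/*
		 * Queue the request into the rumpuser_aios ring; if
		 * advancing the head would hit the tail, the ring is
		 * full and we wait for the I/O thread to drain it.
		 */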
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * control in a repeatable fashion which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
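/*
 * The multiplier/increment pair (1103515245, 12345) is the sample
 * rand() from the C standard, so e.g. a seed of 1 yields 1103527590
 * as the first state.
 */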
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection.  Fails n out of
 * every BLKFAIL_MAX I/Os with EIO.  n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}