/*	$NetBSD: rumpblk.c,v 1.16 2009/04/06 20:40:33 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection.  The driver can be made to fail a
 * configurable fraction of I/O requests.
 *
 * The driver also provides an optimization for regular files by
 * using memory-mapped I/O.  This avoids a host system call for
 * every I/O operation.  It also gives finer-grained control over
 * how to flush data.  Additionally, in case the rump kernel dumps
 * core, we get way less carnage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.16 2009/04/06 20:40:33 pooka Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump/rumpuser.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

#define MEMWINSIZE (1<<20) /* 1MB */
#define MEMWINCOUNT 16 /* max 16 windows == 16 megs of memory */
#define STARTWIN(off) ((off) & ~(MEMWINSIZE-1))
#define INWIN(win,off) ((win)->win_off == STARTWIN(off))
#define WINSIZE(rblk, win) (MIN((rblk->rblk_size-win->win_off),MEMWINSIZE))
#define WINVALID(win) ((win)->win_off != (off_t)-1)
#define WINVALIDATE(win) ((win)->win_off = (off_t)-1)
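/*
 * Illustrative note: windows are MEMWINSIZE-aligned slices of the
 * backing file.  Since MEMWINSIZE is 1MB (0x100000), file offset
 * 0x123456, for example, belongs to the window starting at
 * STARTWIN(0x123456) == 0x100000.  An invalid window is marked with
 * the impossible offset -1, and WINSIZE() clamps the final window
 * so that we never map past the end of the file.
 */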
struct blkwin {
	off_t win_off;
	void *win_mem;
	int win_refcnt;

	TAILQ_ENTRY(blkwin) win_lru;
};

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;

	/* for mmap */
	int rblk_mmflags;
	kmutex_t rblk_memmtx;
	kcondvar_t rblk_memcv;
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	size_t rblk_size;
	bool rblk_waiting;

	struct partition *rblk_curpi;
	struct partition rblk_pi;
	struct disklabel rblk_dl;
} minors[RUMPBLK_SIZE];

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
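/*
 * For example (illustrative arithmetic): setting RUMP_BLKFAIL=100 in
 * the environment fails roughly 100 out of every 10000 requests, i.e.
 * a 1% failure rate; values >= BLKFAIL_MAX fail everything.
 */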
static int blkfail;
static unsigned randstate;

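/*
 * Get a memory window for the given device offset.  A sketch of the
 * protocol: first search the LRU queue for a valid window containing
 * the offset.  If none is found, evict and remap the least recently
 * used window, provided it has no users; otherwise sleep on the
 * condvar until a window is released and retry.  On success the
 * window is referenced and moved to the head of the LRU queue, and
 * *wsize is clamped to what is available within this window.
 */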
static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found? return */
	if (win) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window.  If the least recently used is not
	 * currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		DPRINTF(("memwin wait\n"));
		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, MEMWINSIZE - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_signal(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

int
rumpblk_init(void)
{
	char buf[64];
	int rumpblk = RUMPBLK;
	int error, i;

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = arc4random(); /* XXX: not enough entropy */
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! every %d out of"
		    " %d I/O will fail. key %u\n", blkfail, BLKFAIL_MAX,
		    randstate);
	} else {
		blkfail = 0;
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
	}

	if (blkfail) {
		return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
		    &rumpblk_cdevsw, &rumpblk);
	} else {
		return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
		    &rumpblk_cdevsw, &rumpblk);
	}
}

int
rumpblk_register(const char *path)
{
	size_t len;
	int i;

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path && strcmp(minors[i].rblk_path, path)==0)
			return i;

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE)
		return -1;

	len = strlen(path);
	minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(minors[i].rblk_path, path);
	minors[i].rblk_fd = -1;
	return i;
}
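
/*
 * Usage sketch (hypothetical caller, example path): register a host
 * backing file and open the resulting device through the rumpblk
 * major requested above:
 *
 *	int mi = rumpblk_register("/tmp/disk.img");
 *	if (mi != -1)
 *		error = rumpblk_open(makedev(RUMPBLK, mi),
 *		    FREAD|FWRITE, S_IFBLK, curlwp);
 */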

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	uint64_t fsize;
	int ft, dummy;
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
	if (error)
		return error;

	if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
		rumpuser_close(fd, &dummy);
		return error;
	}

	if (ft == RUMPUSER_FT_REG) {
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (flag & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (flag & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_size = fsize;
		rblk->rblk_fd = fd;

		for (i = 0; i < MEMWINCOUNT && i * MEMWINSIZE < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = 1;
			win = getwindow(rblk, i*MEMWINSIZE, &winsize, &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				wincleanup(rblk);
				break;
			}
		}

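		/*
		 * Fake a disklabel for the regular file: a single
		 * partition spanning the whole file, with DEV_BSIZE
		 * (512-byte) sectors.  E.g. a 1GB image yields
		 * p_size = 2^30 >> DEV_BSHIFT = 2097152 sectors.
		 */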
		memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
		rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
		rblk->rblk_dl.d_secsize = DEV_BSIZE;
		rblk->rblk_curpi = &rblk->rblk_pi;
	} else {
		if (rumpuser_ioctl(fd, DIOCGDINFO, &rblk->rblk_dl,
		    &error) == -1) {
			rumpuser_close(fd, &dummy);
			return error;
		}

		rblk->rblk_fd = fd;
		rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];
	int rv, error;

	if (xfer == DIOCGPART) {
		struct partinfo *pi = (struct partinfo *)addr;

		pi->part = rblk->rblk_curpi;
		pi->disklab = &rblk->rblk_dl;

		return 0;
	}

	rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
	if (rv == -1)
		return error;

	return 0;
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	panic("%s: unimplemented", __func__);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async, error;

	off = bp->b_blkno << DEV_BSHIFT;
	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * what the kernel bounds_check() routines do.
	 */
	if (rblk->rblk_size && off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	async = bp->b_flags & B_ASYNC;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 ")\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount)));

	/* mmap? handle here and return */
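	/*
	 * Worked example (illustrative numbers): a 64kB (0x10000)
	 * request at offset 0xff8000 straddles a 1MB window boundary.
	 * The first getwindow() call clamps winsize to 0x8000, the
	 * bytes remaining in the window starting at 0xf00000; the
	 * second iteration copies the other 0x8000 bytes from the
	 * window starting at 0x1000000.
	 */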
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/Os are
	 * executed serially.  Technically executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */
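	/*
	 * Note on the request ring used below: rumpuser_aios is a
	 * circular buffer indexed by rumpuser_aio_head/_tail.  It is
	 * full when (head + 1) % N_AIOS == tail, in which case we
	 * sleep on rumpuser_aio_cv until the consumer in rumpuser
	 * advances the tail.
	 */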
	if (rump_threads) {
		struct rumpuser_aio *rua;

		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail)
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = rblk->rblk_fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = BUF_ISREAD(bp);

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);

		/* make sure non-async writes end up on backing media */
		if (BUF_ISWRITE(bp) && !async) {
			biowait(bp);
			rumpuser_fsync(rblk->rblk_fd, &error);
		}
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (!async) {
			if (BUF_ISWRITE(bp))
				rumpuser_fsync(rblk->rblk_fd, &error);
		}
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * very repeatably control which blocks will fail.
 *
 * <mlelstv> pooka, rand()
 * <mlelstv> [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}
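
/*
 * Note: this is the classic rand(3) linear congruential generator
 * with modulus 2^31.  Because the sequence is fully determined by
 * randstate, setting RUMP_BLKFAIL_SEED to the "key" printed at init
 * time reproduces the exact same failure pattern on a later run.
 */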

/*
 * Block device with very simple fault injection.  Fails every
 * n out of BLKFAIL_MAX I/O with EIO.  n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}