rumpblk.c revision 1.14 1 1.14 pooka /* $NetBSD: rumpblk.c,v 1.14 2009/03/23 11:52:42 pooka Exp $ */
2 1.1 pooka
3 1.1 pooka /*
4 1.1 pooka * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 1.1 pooka *
6 1.1 pooka * Development of this software was supported by the
7 1.1 pooka * Finnish Cultural Foundation.
8 1.1 pooka *
9 1.1 pooka * Redistribution and use in source and binary forms, with or without
10 1.1 pooka * modification, are permitted provided that the following conditions
11 1.1 pooka * are met:
12 1.1 pooka * 1. Redistributions of source code must retain the above copyright
13 1.1 pooka * notice, this list of conditions and the following disclaimer.
14 1.1 pooka * 2. Redistributions in binary form must reproduce the above copyright
15 1.1 pooka * notice, this list of conditions and the following disclaimer in the
16 1.1 pooka * documentation and/or other materials provided with the distribution.
17 1.1 pooka *
18 1.1 pooka * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 1.1 pooka * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 1.1 pooka * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 1.1 pooka * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 1.1 pooka * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 1.1 pooka * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 1.1 pooka * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 1.1 pooka * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 1.1 pooka * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 1.1 pooka * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 1.1 pooka * SUCH DAMAGE.
29 1.1 pooka */
30 1.1 pooka
31 1.1 pooka /*
32 1.1 pooka * Block device emulation. Presents a block device interface and
33 1.1 pooka * uses rumpuser system calls to satisfy I/O requests.
34 1.1 pooka */
35 1.1 pooka
36 1.1 pooka #include <sys/cdefs.h>
37 1.14 pooka __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.14 2009/03/23 11:52:42 pooka Exp $");
38 1.1 pooka
39 1.1 pooka #include <sys/param.h>
40 1.1 pooka #include <sys/buf.h>
41 1.1 pooka #include <sys/conf.h>
42 1.1 pooka #include <sys/disklabel.h>
43 1.1 pooka #include <sys/fcntl.h>
44 1.1 pooka #include <sys/kmem.h>
45 1.1 pooka #include <sys/malloc.h>
46 1.1 pooka #include <sys/stat.h>
47 1.1 pooka
48 1.1 pooka #include <rump/rumpuser.h>
49 1.1 pooka
50 1.1 pooka #include "rump_private.h"
51 1.1 pooka #include "rump_vfs_private.h"
52 1.1 pooka
53 1.1 pooka #define RUMPBLK_SIZE 16
54 1.1 pooka static struct rblkdev {
55 1.1 pooka char *rblk_path;
56 1.1 pooka int rblk_fd;
57 1.9 pooka uint8_t *rblk_mem;
58 1.11 pooka off_t rblk_size;
59 1.1 pooka
60 1.1 pooka struct partition *rblk_curpi;
61 1.1 pooka struct partition rblk_pi;
62 1.1 pooka struct disklabel rblk_dl;
63 1.1 pooka } minors[RUMPBLK_SIZE];
64 1.1 pooka
/* Entry points shared by the block and character switches. */
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_ioctl(rumpblk_ioctl);

/* Block-device strategy routines: normal and fault-injecting. */
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);

/* Character-device I/O entry points. */
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);

/* Declared only; the switch tables use nodump/nosize instead. */
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);
74 1.1 pooka
75 1.1 pooka static const struct bdevsw rumpblk_bdevsw = {
76 1.1 pooka rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
77 1.1 pooka nodump, nosize, D_DISK
78 1.1 pooka };
79 1.1 pooka
80 1.3 pooka static const struct bdevsw rumpblk_bdevsw_fail = {
81 1.3 pooka rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
82 1.3 pooka nodump, nosize, D_DISK
83 1.3 pooka };
84 1.3 pooka
85 1.1 pooka static const struct cdevsw rumpblk_cdevsw = {
86 1.1 pooka rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
87 1.1 pooka rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
88 1.1 pooka };
89 1.1 pooka
/*
 * Fault injection state: blkfail out of every BLKFAIL_MAX I/O requests
 * are failed.  randstate is the state of the private generator
 * (gimmerand()) that selects which requests fail.
 */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned int randstate;
94 1.1 pooka
95 1.1 pooka int
96 1.8 cegger rumpblk_init(void)
97 1.1 pooka {
98 1.3 pooka char buf[64];
99 1.1 pooka int rumpblk = RUMPBLK;
100 1.3 pooka int error;
101 1.3 pooka
102 1.3 pooka if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
103 1.3 pooka blkfail = strtoul(buf, NULL, 10);
104 1.3 pooka /* fail everything */
105 1.3 pooka if (blkfail > BLKFAIL_MAX)
106 1.3 pooka blkfail = BLKFAIL_MAX;
107 1.3 pooka if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
108 1.3 pooka &error) == 0) {
109 1.3 pooka randstate = strtoul(buf, NULL, 10);
110 1.3 pooka } else {
111 1.3 pooka randstate = arc4random(); /* XXX: not enough entropy */
112 1.3 pooka }
113 1.3 pooka printf("rumpblk: FAULT INJECTION ACTIVE! every %d out of"
114 1.3 pooka " %d I/O will fail. key %u\n", blkfail, BLKFAIL_MAX,
115 1.3 pooka randstate);
116 1.3 pooka } else {
117 1.3 pooka blkfail = 0;
118 1.3 pooka }
119 1.1 pooka
120 1.3 pooka if (blkfail) {
121 1.3 pooka return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
122 1.3 pooka &rumpblk_cdevsw, &rumpblk);
123 1.3 pooka } else {
124 1.3 pooka return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
125 1.3 pooka &rumpblk_cdevsw, &rumpblk);
126 1.3 pooka }
127 1.1 pooka }
128 1.1 pooka
129 1.1 pooka int
130 1.1 pooka rumpblk_register(const char *path)
131 1.1 pooka {
132 1.1 pooka size_t len;
133 1.1 pooka int i;
134 1.1 pooka
135 1.1 pooka for (i = 0; i < RUMPBLK_SIZE; i++)
136 1.10 uebayasi if (minors[i].rblk_path && strcmp(minors[i].rblk_path, path) == 0)
137 1.1 pooka return i;
138 1.1 pooka
139 1.1 pooka for (i = 0; i < RUMPBLK_SIZE; i++)
140 1.1 pooka if (minors[i].rblk_path == NULL)
141 1.1 pooka break;
142 1.1 pooka if (i == RUMPBLK_SIZE)
143 1.1 pooka return -1;
144 1.1 pooka
145 1.1 pooka len = strlen(path);
146 1.10 uebayasi minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
147 1.1 pooka strcpy(minors[i].rblk_path, path);
148 1.1 pooka minors[i].rblk_fd = -1;
149 1.1 pooka return i;
150 1.1 pooka }
151 1.1 pooka
152 1.1 pooka int
153 1.1 pooka rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
154 1.1 pooka {
155 1.1 pooka struct rblkdev *rblk = &minors[minor(dev)];
156 1.9 pooka uint8_t *mem = NULL;
157 1.5 pooka uint64_t fsize;
158 1.9 pooka int ft, dummy;
159 1.1 pooka int error, fd;
160 1.1 pooka
161 1.1 pooka KASSERT(rblk->rblk_fd == -1);
162 1.1 pooka fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
163 1.1 pooka if (error)
164 1.1 pooka return error;
165 1.1 pooka
166 1.9 pooka if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
167 1.9 pooka rumpuser_close(fd, &dummy);
168 1.9 pooka return error;
169 1.9 pooka }
170 1.9 pooka
171 1.9 pooka if (ft == RUMPUSER_FT_REG) {
172 1.1 pooka /*
173 1.9 pooka * Try to mmap the file if it's size is max. half of
174 1.9 pooka * the address space. If mmap fails due to e.g. limits,
175 1.9 pooka * we fall back to the read/write path. This test is only
176 1.9 pooka * to prevent size_t vs. off_t wraparounds.
177 1.1 pooka */
178 1.10 uebayasi if (fsize < UINT64_C(1) << (sizeof(void *) * 8 - 1)) {
179 1.9 pooka int mmflags;
180 1.9 pooka
181 1.9 pooka mmflags = 0;
182 1.9 pooka if (flag & FREAD)
183 1.9 pooka mmflags |= RUMPUSER_FILEMMAP_READ;
184 1.12 pooka if (flag & FWRITE) {
185 1.9 pooka mmflags |= RUMPUSER_FILEMMAP_WRITE;
186 1.12 pooka mmflags |= RUMPUSER_FILEMMAP_SHARED;
187 1.12 pooka }
188 1.9 pooka mem = rumpuser_filemmap(fd, 0, fsize, mmflags, &error);
189 1.9 pooka }
190 1.9 pooka
191 1.1 pooka memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
192 1.1 pooka
193 1.9 pooka rblk->rblk_size = fsize;
194 1.9 pooka rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
195 1.9 pooka rblk->rblk_dl.d_secsize = DEV_BSIZE;
196 1.9 pooka rblk->rblk_curpi = &rblk->rblk_pi;
197 1.9 pooka } else {
198 1.13 pooka if (rumpuser_ioctl(fd, DIOCGDINFO, &rblk->rblk_dl,
199 1.13 pooka &error) != -1) {
200 1.1 pooka rumpuser_close(fd, &dummy);
201 1.1 pooka return error;
202 1.1 pooka }
203 1.9 pooka
204 1.9 pooka rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
205 1.1 pooka }
206 1.1 pooka rblk->rblk_fd = fd;
207 1.9 pooka rblk->rblk_mem = mem;
208 1.13 pooka if (rblk->rblk_mem != NULL)
209 1.13 pooka printf("rumpblk%d: using mmio for %s\n",
210 1.13 pooka minor(dev), rblk->rblk_path);
211 1.1 pooka
212 1.1 pooka return 0;
213 1.1 pooka }
214 1.1 pooka
215 1.1 pooka int
216 1.1 pooka rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
217 1.1 pooka {
218 1.1 pooka struct rblkdev *rblk = &minors[minor(dev)];
219 1.1 pooka int dummy;
220 1.1 pooka
221 1.9 pooka if (rblk->rblk_mem) {
222 1.9 pooka KASSERT(rblk->rblk_size);
223 1.9 pooka rumpuser_memsync(rblk->rblk_mem, rblk->rblk_size, &dummy);
224 1.9 pooka rumpuser_unmap(rblk->rblk_mem, rblk->rblk_size);
225 1.9 pooka rblk->rblk_mem = NULL;
226 1.9 pooka }
227 1.1 pooka rumpuser_close(rblk->rblk_fd, &dummy);
228 1.1 pooka rblk->rblk_fd = -1;
229 1.1 pooka
230 1.1 pooka return 0;
231 1.1 pooka }
232 1.1 pooka
233 1.1 pooka int
234 1.1 pooka rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
235 1.1 pooka {
236 1.1 pooka struct rblkdev *rblk = &minors[minor(dev)];
237 1.1 pooka int rv, error;
238 1.1 pooka
239 1.1 pooka if (xfer == DIOCGPART) {
240 1.1 pooka struct partinfo *pi = (struct partinfo *)addr;
241 1.1 pooka
242 1.1 pooka pi->part = rblk->rblk_curpi;
243 1.1 pooka pi->disklab = &rblk->rblk_dl;
244 1.1 pooka
245 1.1 pooka return 0;
246 1.1 pooka }
247 1.1 pooka
248 1.1 pooka rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
249 1.1 pooka if (rv == -1)
250 1.1 pooka return error;
251 1.1 pooka
252 1.1 pooka return 0;
253 1.1 pooka }
254 1.1 pooka
255 1.1 pooka int
256 1.1 pooka rumpblk_read(dev_t dev, struct uio *uio, int flags)
257 1.1 pooka {
258 1.1 pooka
259 1.1 pooka panic("%s: unimplemented", __func__);
260 1.1 pooka }
261 1.1 pooka
262 1.1 pooka int
263 1.1 pooka rumpblk_write(dev_t dev, struct uio *uio, int flags)
264 1.1 pooka {
265 1.1 pooka
266 1.1 pooka panic("%s: unimplemented", __func__);
267 1.1 pooka }
268 1.1 pooka
269 1.3 pooka static void
270 1.3 pooka dostrategy(struct buf *bp)
271 1.1 pooka {
272 1.1 pooka struct rblkdev *rblk = &minors[minor(bp->b_dev)];
273 1.1 pooka off_t off;
274 1.9 pooka int async, error;
275 1.1 pooka
276 1.1 pooka off = bp->b_blkno << DEV_BSHIFT;
277 1.11 pooka /*
278 1.11 pooka * Do bounds checking if we're working on a file. Otherwise
279 1.11 pooka * invalid file systems might attempt to read beyond EOF. This
280 1.11 pooka * is bad(tm) especially on mmapped images. This is essentially
281 1.11 pooka * the kernel bounds_check() routines.
282 1.11 pooka */
283 1.11 pooka if (rblk->rblk_size && off + bp->b_bcount > rblk->rblk_size) {
284 1.11 pooka int64_t sz = rblk->rblk_size - off;
285 1.11 pooka
286 1.11 pooka /* EOF */
287 1.11 pooka if (sz == 0) {
288 1.11 pooka rump_biodone(bp, 0, 0);
289 1.11 pooka return;
290 1.11 pooka }
291 1.11 pooka /* beyond EOF ==> error */
292 1.11 pooka if (sz < 0) {
293 1.11 pooka rump_biodone(bp, 0, EINVAL);
294 1.11 pooka return;
295 1.11 pooka }
296 1.11 pooka
297 1.11 pooka /* truncate to device size */
298 1.11 pooka bp->b_bcount = sz;
299 1.11 pooka }
300 1.11 pooka
301 1.9 pooka async = bp->b_flags & B_ASYNC;
302 1.1 pooka DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
303 1.1 pooka " (0x%" PRIx64 " - 0x%" PRIx64")\n",
304 1.1 pooka bp->b_bcount, BUF_ISREAD(bp) "READ" : "WRITE",
305 1.1 pooka off, off, (off + bp->b_bcount)));
306 1.1 pooka
307 1.9 pooka /* mem optimization? handle here and return */
308 1.9 pooka if (rblk->rblk_mem) {
309 1.9 pooka uint8_t *ioaddr = rblk->rblk_mem + off;
310 1.11 pooka
311 1.9 pooka if (BUF_ISREAD(bp)) {
312 1.9 pooka memcpy(bp->b_data, ioaddr, bp->b_bcount);
313 1.9 pooka } else {
314 1.9 pooka memcpy(ioaddr, bp->b_data, bp->b_bcount);
315 1.9 pooka }
316 1.9 pooka
317 1.9 pooka /* synchronous write, sync necessary bits back to disk */
318 1.9 pooka if (BUF_ISWRITE(bp) && !async) {
319 1.9 pooka rumpuser_memsync(ioaddr, bp->b_bcount, &error);
320 1.9 pooka }
321 1.9 pooka rump_biodone(bp, bp->b_bcount, 0);
322 1.9 pooka
323 1.9 pooka return;
324 1.9 pooka }
325 1.9 pooka
326 1.1 pooka /*
327 1.1 pooka * Do I/O. We have different paths for async and sync I/O.
328 1.1 pooka * Async I/O is done by passing a request to rumpuser where
329 1.1 pooka * it is executed. The rumpuser routine then calls
330 1.1 pooka * biodone() to signal any waiters in the kernel. I/O's are
331 1.1 pooka * executed in series. Technically executing them in parallel
332 1.1 pooka * would produce better results, but then we'd need either
333 1.1 pooka * more threads or posix aio. Maybe worth investigating
334 1.1 pooka * this later.
335 1.14 pooka *
336 1.14 pooka * Using bufq here might be a good idea.
337 1.1 pooka */
338 1.13 pooka if (rump_threads) {
339 1.1 pooka struct rumpuser_aio *rua;
340 1.1 pooka
341 1.1 pooka rumpuser_mutex_enter(&rumpuser_aio_mtx);
342 1.13 pooka while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail)
343 1.13 pooka rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
344 1.1 pooka
345 1.2 pooka rua = &rumpuser_aios[rumpuser_aio_head];
346 1.2 pooka KASSERT(rua->rua_bp == NULL);
347 1.2 pooka rua->rua_fd = rblk->rblk_fd;
348 1.2 pooka rua->rua_data = bp->b_data;
349 1.2 pooka rua->rua_dlen = bp->b_bcount;
350 1.2 pooka rua->rua_off = off;
351 1.2 pooka rua->rua_bp = bp;
352 1.2 pooka rua->rua_op = BUF_ISREAD(bp);
353 1.2 pooka
354 1.1 pooka /* insert into queue & signal */
355 1.6 pooka rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
356 1.1 pooka rumpuser_cv_signal(&rumpuser_aio_cv);
357 1.1 pooka rumpuser_mutex_exit(&rumpuser_aio_mtx);
358 1.13 pooka
359 1.13 pooka /* make sure non-async writes end up on backing media */
360 1.13 pooka if (BUF_ISWRITE(bp) && !async) {
361 1.13 pooka biowait(bp);
362 1.13 pooka rumpuser_fsync(rblk->rblk_fd, &error);
363 1.13 pooka }
364 1.1 pooka } else {
365 1.1 pooka if (BUF_ISREAD(bp)) {
366 1.1 pooka rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
367 1.1 pooka bp->b_bcount, off, rump_biodone, bp);
368 1.1 pooka } else {
369 1.1 pooka rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
370 1.1 pooka bp->b_bcount, off, rump_biodone, bp);
371 1.1 pooka }
372 1.1 pooka if (!async) {
373 1.1 pooka if (BUF_ISWRITE(bp))
374 1.1 pooka rumpuser_fsync(rblk->rblk_fd, &error);
375 1.1 pooka }
376 1.1 pooka }
377 1.1 pooka }
378 1.3 pooka
/*
 * Strategy routine of the regular (non fault-injecting) block device
 * switch: hands every request straight to dostrategy().
 */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
385 1.3 pooka
386 1.3 pooka /*
387 1.4 pooka * Simple random number generator. This is private so that we can
388 1.4 pooka * very repeatedly control which blocks will fail.
389 1.4 pooka *
390 1.3 pooka * <mlelstv> pooka, rand()
391 1.3 pooka * <mlelstv> [paste]
392 1.3 pooka */
393 1.3 pooka static unsigned
394 1.3 pooka gimmerand(void)
395 1.3 pooka {
396 1.3 pooka
397 1.3 pooka return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
398 1.3 pooka }
399 1.3 pooka
400 1.3 pooka /*
401 1.3 pooka * Block device with very simple fault injection. Fails every
402 1.3 pooka * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
403 1.3 pooka * variable RUMP_BLKFAIL.
404 1.3 pooka */
405 1.3 pooka void
406 1.3 pooka rumpblk_strategy_fail(struct buf *bp)
407 1.3 pooka {
408 1.3 pooka
409 1.3 pooka if (gimmerand() % BLKFAIL_MAX >= blkfail) {
410 1.3 pooka dostrategy(bp);
411 1.3 pooka } else {
412 1.3 pooka printf("block fault injection: failing I/O on block %lld\n",
413 1.3 pooka (long long)bp->b_blkno);
414 1.3 pooka bp->b_error = EIO;
415 1.3 pooka biodone(bp);
416 1.3 pooka }
417 1.3 pooka }
418