rumpblk.c revision 1.14 1 /* $NetBSD: rumpblk.c,v 1.14 2009/03/23 11:52:42 pooka Exp $ */
2
3 /*
4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 *
6 * Development of this software was supported by the
7 * Finnish Cultural Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 /*
32 * Block device emulation. Presents a block device interface and
33 * uses rumpuser system calls to satisfy I/O requests.
34 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.14 2009/03/23 11:52:42 pooka Exp $");
38
39 #include <sys/param.h>
40 #include <sys/buf.h>
41 #include <sys/conf.h>
42 #include <sys/disklabel.h>
43 #include <sys/fcntl.h>
44 #include <sys/kmem.h>
45 #include <sys/malloc.h>
46 #include <sys/stat.h>
47
48 #include <rump/rumpuser.h>
49
50 #include "rump_private.h"
51 #include "rump_vfs_private.h"
52
53 #define RUMPBLK_SIZE 16
54 static struct rblkdev {
55 char *rblk_path;
56 int rblk_fd;
57 uint8_t *rblk_mem;
58 off_t rblk_size;
59
60 struct partition *rblk_curpi;
61 struct partition rblk_pi;
62 struct disklabel rblk_dl;
63 } minors[RUMPBLK_SIZE];
64
/* Entry points shared by the block and character device switches. */
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_ioctl(rumpblk_ioctl);

/* Character device data path. */
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);

/* Block device data path: plain and fault-injecting variants. */
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);

/* Declared only; the switch tables below use nodump/nosize. */
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);
74
75 static const struct bdevsw rumpblk_bdevsw = {
76 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
77 nodump, nosize, D_DISK
78 };
79
80 static const struct bdevsw rumpblk_bdevsw_fail = {
81 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
82 nodump, nosize, D_DISK
83 };
84
85 static const struct cdevsw rumpblk_cdevsw = {
86 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
87 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
88 };
89
/*
 * Fault injection state: blkfail out of every BLKFAIL_MAX I/Os are
 * failed by rumpblk_strategy_fail().
 */
#define BLKFAIL_MAX 10000
static int blkfail;		/* 0 means no fault injection */
static unsigned int randstate;	/* state of the private generator */
94
95 int
96 rumpblk_init(void)
97 {
98 char buf[64];
99 int rumpblk = RUMPBLK;
100 int error;
101
102 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
103 blkfail = strtoul(buf, NULL, 10);
104 /* fail everything */
105 if (blkfail > BLKFAIL_MAX)
106 blkfail = BLKFAIL_MAX;
107 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
108 &error) == 0) {
109 randstate = strtoul(buf, NULL, 10);
110 } else {
111 randstate = arc4random(); /* XXX: not enough entropy */
112 }
113 printf("rumpblk: FAULT INJECTION ACTIVE! every %d out of"
114 " %d I/O will fail. key %u\n", blkfail, BLKFAIL_MAX,
115 randstate);
116 } else {
117 blkfail = 0;
118 }
119
120 if (blkfail) {
121 return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
122 &rumpblk_cdevsw, &rumpblk);
123 } else {
124 return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
125 &rumpblk_cdevsw, &rumpblk);
126 }
127 }
128
129 int
130 rumpblk_register(const char *path)
131 {
132 size_t len;
133 int i;
134
135 for (i = 0; i < RUMPBLK_SIZE; i++)
136 if (minors[i].rblk_path && strcmp(minors[i].rblk_path, path) == 0)
137 return i;
138
139 for (i = 0; i < RUMPBLK_SIZE; i++)
140 if (minors[i].rblk_path == NULL)
141 break;
142 if (i == RUMPBLK_SIZE)
143 return -1;
144
145 len = strlen(path);
146 minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
147 strcpy(minors[i].rblk_path, path);
148 minors[i].rblk_fd = -1;
149 return i;
150 }
151
152 int
153 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
154 {
155 struct rblkdev *rblk = &minors[minor(dev)];
156 uint8_t *mem = NULL;
157 uint64_t fsize;
158 int ft, dummy;
159 int error, fd;
160
161 KASSERT(rblk->rblk_fd == -1);
162 fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
163 if (error)
164 return error;
165
166 if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
167 rumpuser_close(fd, &dummy);
168 return error;
169 }
170
171 if (ft == RUMPUSER_FT_REG) {
172 /*
173 * Try to mmap the file if it's size is max. half of
174 * the address space. If mmap fails due to e.g. limits,
175 * we fall back to the read/write path. This test is only
176 * to prevent size_t vs. off_t wraparounds.
177 */
178 if (fsize < UINT64_C(1) << (sizeof(void *) * 8 - 1)) {
179 int mmflags;
180
181 mmflags = 0;
182 if (flag & FREAD)
183 mmflags |= RUMPUSER_FILEMMAP_READ;
184 if (flag & FWRITE) {
185 mmflags |= RUMPUSER_FILEMMAP_WRITE;
186 mmflags |= RUMPUSER_FILEMMAP_SHARED;
187 }
188 mem = rumpuser_filemmap(fd, 0, fsize, mmflags, &error);
189 }
190
191 memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
192
193 rblk->rblk_size = fsize;
194 rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
195 rblk->rblk_dl.d_secsize = DEV_BSIZE;
196 rblk->rblk_curpi = &rblk->rblk_pi;
197 } else {
198 if (rumpuser_ioctl(fd, DIOCGDINFO, &rblk->rblk_dl,
199 &error) != -1) {
200 rumpuser_close(fd, &dummy);
201 return error;
202 }
203
204 rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
205 }
206 rblk->rblk_fd = fd;
207 rblk->rblk_mem = mem;
208 if (rblk->rblk_mem != NULL)
209 printf("rumpblk%d: using mmio for %s\n",
210 minor(dev), rblk->rblk_path);
211
212 return 0;
213 }
214
215 int
216 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
217 {
218 struct rblkdev *rblk = &minors[minor(dev)];
219 int dummy;
220
221 if (rblk->rblk_mem) {
222 KASSERT(rblk->rblk_size);
223 rumpuser_memsync(rblk->rblk_mem, rblk->rblk_size, &dummy);
224 rumpuser_unmap(rblk->rblk_mem, rblk->rblk_size);
225 rblk->rblk_mem = NULL;
226 }
227 rumpuser_close(rblk->rblk_fd, &dummy);
228 rblk->rblk_fd = -1;
229
230 return 0;
231 }
232
233 int
234 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
235 {
236 struct rblkdev *rblk = &minors[minor(dev)];
237 int rv, error;
238
239 if (xfer == DIOCGPART) {
240 struct partinfo *pi = (struct partinfo *)addr;
241
242 pi->part = rblk->rblk_curpi;
243 pi->disklab = &rblk->rblk_dl;
244
245 return 0;
246 }
247
248 rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
249 if (rv == -1)
250 return error;
251
252 return 0;
253 }
254
255 int
256 rumpblk_read(dev_t dev, struct uio *uio, int flags)
257 {
258
259 panic("%s: unimplemented", __func__);
260 }
261
262 int
263 rumpblk_write(dev_t dev, struct uio *uio, int flags)
264 {
265
266 panic("%s: unimplemented", __func__);
267 }
268
269 static void
270 dostrategy(struct buf *bp)
271 {
272 struct rblkdev *rblk = &minors[minor(bp->b_dev)];
273 off_t off;
274 int async, error;
275
276 off = bp->b_blkno << DEV_BSHIFT;
277 /*
278 * Do bounds checking if we're working on a file. Otherwise
279 * invalid file systems might attempt to read beyond EOF. This
280 * is bad(tm) especially on mmapped images. This is essentially
281 * the kernel bounds_check() routines.
282 */
283 if (rblk->rblk_size && off + bp->b_bcount > rblk->rblk_size) {
284 int64_t sz = rblk->rblk_size - off;
285
286 /* EOF */
287 if (sz == 0) {
288 rump_biodone(bp, 0, 0);
289 return;
290 }
291 /* beyond EOF ==> error */
292 if (sz < 0) {
293 rump_biodone(bp, 0, EINVAL);
294 return;
295 }
296
297 /* truncate to device size */
298 bp->b_bcount = sz;
299 }
300
301 async = bp->b_flags & B_ASYNC;
302 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
303 " (0x%" PRIx64 " - 0x%" PRIx64")\n",
304 bp->b_bcount, BUF_ISREAD(bp) "READ" : "WRITE",
305 off, off, (off + bp->b_bcount)));
306
307 /* mem optimization? handle here and return */
308 if (rblk->rblk_mem) {
309 uint8_t *ioaddr = rblk->rblk_mem + off;
310
311 if (BUF_ISREAD(bp)) {
312 memcpy(bp->b_data, ioaddr, bp->b_bcount);
313 } else {
314 memcpy(ioaddr, bp->b_data, bp->b_bcount);
315 }
316
317 /* synchronous write, sync necessary bits back to disk */
318 if (BUF_ISWRITE(bp) && !async) {
319 rumpuser_memsync(ioaddr, bp->b_bcount, &error);
320 }
321 rump_biodone(bp, bp->b_bcount, 0);
322
323 return;
324 }
325
326 /*
327 * Do I/O. We have different paths for async and sync I/O.
328 * Async I/O is done by passing a request to rumpuser where
329 * it is executed. The rumpuser routine then calls
330 * biodone() to signal any waiters in the kernel. I/O's are
331 * executed in series. Technically executing them in parallel
332 * would produce better results, but then we'd need either
333 * more threads or posix aio. Maybe worth investigating
334 * this later.
335 *
336 * Using bufq here might be a good idea.
337 */
338 if (rump_threads) {
339 struct rumpuser_aio *rua;
340
341 rumpuser_mutex_enter(&rumpuser_aio_mtx);
342 while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail)
343 rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
344
345 rua = &rumpuser_aios[rumpuser_aio_head];
346 KASSERT(rua->rua_bp == NULL);
347 rua->rua_fd = rblk->rblk_fd;
348 rua->rua_data = bp->b_data;
349 rua->rua_dlen = bp->b_bcount;
350 rua->rua_off = off;
351 rua->rua_bp = bp;
352 rua->rua_op = BUF_ISREAD(bp);
353
354 /* insert into queue & signal */
355 rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
356 rumpuser_cv_signal(&rumpuser_aio_cv);
357 rumpuser_mutex_exit(&rumpuser_aio_mtx);
358
359 /* make sure non-async writes end up on backing media */
360 if (BUF_ISWRITE(bp) && !async) {
361 biowait(bp);
362 rumpuser_fsync(rblk->rblk_fd, &error);
363 }
364 } else {
365 if (BUF_ISREAD(bp)) {
366 rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
367 bp->b_bcount, off, rump_biodone, bp);
368 } else {
369 rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
370 bp->b_bcount, off, rump_biodone, bp);
371 }
372 if (!async) {
373 if (BUF_ISWRITE(bp))
374 rumpuser_fsync(rblk->rblk_fd, &error);
375 }
376 }
377 }
378
/*
 * Strategy routine of the regular block device switch: hands every
 * request straight to dostrategy().
 */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
385
386 /*
387 * Simple random number generator. This is private so that we can
388 * very repeatedly control which blocks will fail.
389 *
390 * <mlelstv> pooka, rand()
391 * <mlelstv> [paste]
392 */
393 static unsigned
394 gimmerand(void)
395 {
396
397 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
398 }
399
400 /*
401 * Block device with very simple fault injection. Fails every
402 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
403 * variable RUMP_BLKFAIL.
404 */
405 void
406 rumpblk_strategy_fail(struct buf *bp)
407 {
408
409 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
410 dostrategy(bp);
411 } else {
412 printf("block fault injection: failing I/O on block %lld\n",
413 (long long)bp->b_blkno);
414 bp->b_error = EIO;
415 biodone(bp);
416 }
417 }
418