/*	$NetBSD: rumpblk.c,v 1.12 2009/03/23 10:26:49 pooka Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.12 2009/03/23 10:26:49 pooka Exp $");
38
39 #include <sys/param.h>
40 #include <sys/buf.h>
41 #include <sys/conf.h>
42 #include <sys/disklabel.h>
43 #include <sys/fcntl.h>
44 #include <sys/kmem.h>
45 #include <sys/malloc.h>
46 #include <sys/stat.h>
47
48 #include <rump/rumpuser.h>
49
50 #include "rump_private.h"
51 #include "rump_vfs_private.h"
52
53 #define RUMPBLK_SIZE 16
54 static struct rblkdev {
55 char *rblk_path;
56 int rblk_fd;
57 uint8_t *rblk_mem;
58 off_t rblk_size;
59
60 struct partition *rblk_curpi;
61 struct partition rblk_pi;
62 struct disklabel rblk_dl;
63 } minors[RUMPBLK_SIZE];
64
65 dev_type_open(rumpblk_open);
66 dev_type_close(rumpblk_close);
67 dev_type_read(rumpblk_read);
68 dev_type_write(rumpblk_write);
69 dev_type_ioctl(rumpblk_ioctl);
70 dev_type_strategy(rumpblk_strategy);
71 dev_type_strategy(rumpblk_strategy_fail);
72 dev_type_dump(rumpblk_dump);
73 dev_type_size(rumpblk_size);
74
75 static const struct bdevsw rumpblk_bdevsw = {
76 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
77 nodump, nosize, D_DISK
78 };
79
80 static const struct bdevsw rumpblk_bdevsw_fail = {
81 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
82 nodump, nosize, D_DISK
83 };
84
85 static const struct cdevsw rumpblk_cdevsw = {
86 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
87 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
88 };
89
90 /* fail every n out of BLKFAIL_MAX */
91 #define BLKFAIL_MAX 10000
92 static int blkfail;
93 static unsigned randstate;
94
95 int
96 rumpblk_init(void)
97 {
98 char buf[64];
99 int rumpblk = RUMPBLK;
100 int error;
101
102 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
103 blkfail = strtoul(buf, NULL, 10);
104 /* fail everything */
105 if (blkfail > BLKFAIL_MAX)
106 blkfail = BLKFAIL_MAX;
107 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
108 &error) == 0) {
109 randstate = strtoul(buf, NULL, 10);
110 } else {
111 randstate = arc4random(); /* XXX: not enough entropy */
112 }
113 printf("rumpblk: FAULT INJECTION ACTIVE! every %d out of"
114 " %d I/O will fail. key %u\n", blkfail, BLKFAIL_MAX,
115 randstate);
116 } else {
117 blkfail = 0;
118 }
119
120 if (blkfail) {
121 return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
122 &rumpblk_cdevsw, &rumpblk);
123 } else {
124 return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
125 &rumpblk_cdevsw, &rumpblk);
126 }
127 }
128
129 int
130 rumpblk_register(const char *path)
131 {
132 size_t len;
133 int i;
134
135 for (i = 0; i < RUMPBLK_SIZE; i++)
136 if (minors[i].rblk_path && strcmp(minors[i].rblk_path, path) == 0)
137 return i;
138
139 for (i = 0; i < RUMPBLK_SIZE; i++)
140 if (minors[i].rblk_path == NULL)
141 break;
142 if (i == RUMPBLK_SIZE)
143 return -1;
144
145 len = strlen(path);
146 minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
147 strcpy(minors[i].rblk_path, path);
148 minors[i].rblk_fd = -1;
149 return i;
150 }
151
152 int
153 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
154 {
155 struct rblkdev *rblk = &minors[minor(dev)];
156 uint8_t *mem = NULL;
157 uint64_t fsize;
158 int ft, dummy;
159 int error, fd;
160
161 KASSERT(rblk->rblk_fd == -1);
162 fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
163 if (error)
164 return error;
165
166 if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
167 rumpuser_close(fd, &dummy);
168 return error;
169 }
170
171 if (ft == RUMPUSER_FT_REG) {
172 /*
173 * Try to mmap the file if it's size is max. half of
174 * the address space. If mmap fails due to e.g. limits,
175 * we fall back to the read/write path. This test is only
176 * to prevent size_t vs. off_t wraparounds.
177 */
178 if (fsize < UINT64_C(1) << (sizeof(void *) * 8 - 1)) {
179 int mmflags;
180
181 mmflags = 0;
182 if (flag & FREAD)
183 mmflags |= RUMPUSER_FILEMMAP_READ;
184 if (flag & FWRITE) {
185 mmflags |= RUMPUSER_FILEMMAP_WRITE;
186 mmflags |= RUMPUSER_FILEMMAP_SHARED;
187 }
188 mem = rumpuser_filemmap(fd, 0, fsize, mmflags, &error);
189 }
190
191 memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
192
193 rblk->rblk_size = fsize;
194 rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
195 rblk->rblk_dl.d_secsize = DEV_BSIZE;
196 rblk->rblk_curpi = &rblk->rblk_pi;
197 } else {
198 if (rumpuser_ioctl(fd,DIOCGDINFO, &rblk->rblk_dl, &error) != -1) {
199 rumpuser_close(fd, &dummy);
200 return error;
201 }
202
203 rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
204 }
205 rblk->rblk_fd = fd;
206 rblk->rblk_mem = mem;
207
208 return 0;
209 }
210
211 int
212 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
213 {
214 struct rblkdev *rblk = &minors[minor(dev)];
215 int dummy;
216
217 if (rblk->rblk_mem) {
218 KASSERT(rblk->rblk_size);
219 rumpuser_memsync(rblk->rblk_mem, rblk->rblk_size, &dummy);
220 rumpuser_unmap(rblk->rblk_mem, rblk->rblk_size);
221 rblk->rblk_mem = NULL;
222 }
223 rumpuser_close(rblk->rblk_fd, &dummy);
224 rblk->rblk_fd = -1;
225
226 return 0;
227 }
228
229 int
230 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
231 {
232 struct rblkdev *rblk = &minors[minor(dev)];
233 int rv, error;
234
235 if (xfer == DIOCGPART) {
236 struct partinfo *pi = (struct partinfo *)addr;
237
238 pi->part = rblk->rblk_curpi;
239 pi->disklab = &rblk->rblk_dl;
240
241 return 0;
242 }
243
244 rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
245 if (rv == -1)
246 return error;
247
248 return 0;
249 }
250
251 int
252 rumpblk_read(dev_t dev, struct uio *uio, int flags)
253 {
254
255 panic("%s: unimplemented", __func__);
256 }
257
258 int
259 rumpblk_write(dev_t dev, struct uio *uio, int flags)
260 {
261
262 panic("%s: unimplemented", __func__);
263 }
264
265 static void
266 dostrategy(struct buf *bp)
267 {
268 struct rblkdev *rblk = &minors[minor(bp->b_dev)];
269 off_t off;
270 int async, error;
271
272 off = bp->b_blkno << DEV_BSHIFT;
273 /*
274 * Do bounds checking if we're working on a file. Otherwise
275 * invalid file systems might attempt to read beyond EOF. This
276 * is bad(tm) especially on mmapped images. This is essentially
277 * the kernel bounds_check() routines.
278 */
279 if (rblk->rblk_size && off + bp->b_bcount > rblk->rblk_size) {
280 int64_t sz = rblk->rblk_size - off;
281
282 /* EOF */
283 if (sz == 0) {
284 rump_biodone(bp, 0, 0);
285 return;
286 }
287 /* beyond EOF ==> error */
288 if (sz < 0) {
289 rump_biodone(bp, 0, EINVAL);
290 return;
291 }
292
293 /* truncate to device size */
294 bp->b_bcount = sz;
295 }
296
297 async = bp->b_flags & B_ASYNC;
298 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
299 " (0x%" PRIx64 " - 0x%" PRIx64")\n",
300 bp->b_bcount, BUF_ISREAD(bp) "READ" : "WRITE",
301 off, off, (off + bp->b_bcount)));
302
303 /* mem optimization? handle here and return */
304 if (rblk->rblk_mem) {
305 uint8_t *ioaddr = rblk->rblk_mem + off;
306
307 if (BUF_ISREAD(bp)) {
308 memcpy(bp->b_data, ioaddr, bp->b_bcount);
309 } else {
310 memcpy(ioaddr, bp->b_data, bp->b_bcount);
311 }
312
313 /* synchronous write, sync necessary bits back to disk */
314 if (BUF_ISWRITE(bp) && !async) {
315 rumpuser_memsync(ioaddr, bp->b_bcount, &error);
316 }
317 rump_biodone(bp, bp->b_bcount, 0);
318
319 return;
320 }
321
322 /*
323 * Do I/O. We have different paths for async and sync I/O.
324 * Async I/O is done by passing a request to rumpuser where
325 * it is executed. The rumpuser routine then calls
326 * biodone() to signal any waiters in the kernel. I/O's are
327 * executed in series. Technically executing them in parallel
328 * would produce better results, but then we'd need either
329 * more threads or posix aio. Maybe worth investigating
330 * this later.
331 *
332 * Synchronous I/O is done directly in the context mainly to
333 * avoid unnecessary scheduling with the I/O thread.
334 */
335 if (async && rump_threads) {
336 struct rumpuser_aio *rua;
337
338 rumpuser_mutex_enter(&rumpuser_aio_mtx);
339 /*
340 * Check if our buffer is full. Doing it this way
341 * throttles the I/O a bit if we have a massive
342 * async I/O burst.
343 */
344 if ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
345 rumpuser_mutex_exit(&rumpuser_aio_mtx);
346 goto syncfallback;
347 }
348
349 rua = &rumpuser_aios[rumpuser_aio_head];
350 KASSERT(rua->rua_bp == NULL);
351 rua->rua_fd = rblk->rblk_fd;
352 rua->rua_data = bp->b_data;
353 rua->rua_dlen = bp->b_bcount;
354 rua->rua_off = off;
355 rua->rua_bp = bp;
356 rua->rua_op = BUF_ISREAD(bp);
357
358 /* insert into queue & signal */
359 rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
360 rumpuser_cv_signal(&rumpuser_aio_cv);
361 rumpuser_mutex_exit(&rumpuser_aio_mtx);
362 } else {
363 syncfallback:
364 if (BUF_ISREAD(bp)) {
365 rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
366 bp->b_bcount, off, rump_biodone, bp);
367 } else {
368 rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
369 bp->b_bcount, off, rump_biodone, bp);
370 }
371 if (!async) {
372 if (BUF_ISWRITE(bp))
373 rumpuser_fsync(rblk->rblk_fd, &error);
374 }
375 }
376 }
377
378 void
379 rumpblk_strategy(struct buf *bp)
380 {
381
382 dostrategy(bp);
383 }
384
385 /*
386 * Simple random number generator. This is private so that we can
387 * very repeatedly control which blocks will fail.
388 *
389 * <mlelstv> pooka, rand()
390 * <mlelstv> [paste]
391 */
392 static unsigned
393 gimmerand(void)
394 {
395
396 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
397 }
398
399 /*
400 * Block device with very simple fault injection. Fails every
401 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
402 * variable RUMP_BLKFAIL.
403 */
404 void
405 rumpblk_strategy_fail(struct buf *bp)
406 {
407
408 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
409 dostrategy(bp);
410 } else {
411 printf("block fault injection: failing I/O on block %lld\n",
412 (long long)bp->b_blkno);
413 bp->b_error = EIO;
414 biodone(bp);
415 }
416 }
417